Example #1
 def __init__(self,
              taskBuffer,
              jobID,
              jobStatus,
              xmlFile,
              ignoreTmpError=True,
              siteMapper=None):
     self.job = None
     self.jobID = jobID
     self.jobStatus = jobStatus
     self.taskBuffer = taskBuffer
     self.ignoreTmpError = ignoreTmpError
     self.lockXML = None
     self.siteMapper = siteMapper
     self.attemptNr = None
     self.xmlFile = xmlFile
     self.datasetMap = {}
     self.extraInfo = {
         'surl': {},
         'nevents': {},
         'lbnr': {},
         'endpoint': {}
     }
     # extract attemptNr
     try:
         tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1]
         if re.search('^\d+$', tmpAttemptNr) != None:
             self.attemptNr = int(tmpAttemptNr)
     except:
         pass
     # logger
     self.logger = LogWrapper(_logger, str(self.jobID))
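A minimal usage sketch of the LogWrapper pattern in this constructor; the 'add' logger channel is taken from Example #9 below, while the PandaID value and the message are made-up placeholders.

# minimal sketch: 'add' channel as in Example #9; PandaID 4112 and the message are placeholders
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper

_logger = PandaLogger().getLogger('add')
logger = LogWrapper(_logger, str(4112))   # per-job token, prepended to every message
logger.debug("new start: finished attemptNr=1")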
Example #2
 def __init__(self,taskBuffer,siteMapper,evpFileName,ignoreError):
     self.taskBuffer      = taskBuffer
     self.siteMapper      = siteMapper
     self.ignoreError     = ignoreError
     self.evpFileName     = evpFileName
     self.token           = datetime.datetime.utcnow().isoformat(' ')        
     # logger
     self.logger = LogWrapper(_logger,self.token)
     self.pd2p            = DynDataDistributer.DynDataDistributer([],self.taskBuffer,self.siteMapper,
                                                                  token=' ',logger=self.logger)
     self.userDatasetName = ''
     self.creationTime    = ''
     self.params          = ''
     self.lockedBy        = ''
     self.evpFile         = None
     self.userTaskName    = ''
     # message buffer
     self.msgBuffer       = []
     self.lineLimit       = 100
     # JEDI
     self.jediTaskID      = None
Example #3
def make_logger(tmp_log, token=None, method_name=None, hook=None):
    # get method name of caller
    if method_name is None:
        tmpStr = inspect.stack()[1][3]
    else:
        tmpStr = method_name
    if token is not None:
        tmpStr += ' <{0}>'.format(token)
    else:
        tmpStr += ' :'
    newLog = LogWrapper(tmp_log, tmpStr, seeMem=with_memory_profile, hook=hook)
    return newLog
class CloserAtlasPlugin:

    # constructor
    def __init__(self,job,datasets,log):
        self.jobSpec = job
        self.datasets = datasets
        self.tmpLog = LogWrapper(log,"{0} CloserAtlasPlugin".format(self.jobSpec.PandaID))


        
    # execute
    def execute(self):
        try:
            # only for production
            if not self.jobSpec.prodSourceLabel in ['managed','test']:
                return True
            # only for urgent or high prio
            if not self.jobSpec.processingType in ['urgent'] and self.jobSpec.currentPriority <= 1000:
                return True
            # close datasets
            for datasetSpec in self.datasets:
                if re.search('_sub\d+$',datasetSpec.name) == None:
                    continue
                if datasetSpec.status != 'tobeclosed':
                    continue
                try:
                    self.tmpLog.debug('immediate close {0}'.format(datasetSpec.name))
                    rucioAPI.closeDataset(datasetSpec.name)
                except:
                    errtype,errvalue = sys.exc_info()[:2]
                    self.tmpLog.warning('failed to close : {0} {1}'.format(errtype,errvalue))
        except:
            errtype,errvalue = sys.exc_info()[:2]
            self.tmpLog.warning('failed to execute : {0} {1}'.format(errtype,errvalue))
        return True
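For reference, a short sketch of calling the make_logger helper from Example #3; with_memory_profile is a module-level flag in the original code (assumed False here), and the channel name and token are illustrative.

# usage sketch for make_logger; flag value and names below are assumptions, not from the source
from pandalogger.PandaLogger import PandaLogger

with_memory_profile = False
_logger = PandaLogger().getLogger('closer')
tmp_log = make_logger(_logger, token='PandaID=4112', method_name='execute')
tmp_log.debug('immediate close for a _sub dataset')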
Example #5
 def __init__(self,taskBuffer,jobID,jobStatus,xmlFile,ignoreTmpError=True,siteMapper=None):
     self.job = None
     self.jobID = jobID
     self.jobStatus = jobStatus
     self.taskBuffer = taskBuffer
     self.ignoreTmpError = ignoreTmpError
     self.lockXML = None
     self.siteMapper = siteMapper
     self.attemptNr = None
     self.xmlFile = xmlFile
     self.datasetMap = {}
     self.extraInfo = {'surl':{},'nevents':{},'lbnr':{},'endpoint':{}, 'guid':{}}
     # extract attemptNr
     try:
         tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1]
         if re.search('^\d+$',tmpAttemptNr) != None:
             self.attemptNr = int(tmpAttemptNr)
     except:
         pass
     # logger
     self.logger = LogWrapper(_logger,str(self.jobID))
Example #6
 def __init__(self,taskBuffer,siteMapper,evpFileName,ignoreError):
     self.taskBuffer      = taskBuffer
     self.siteMapper      = siteMapper
     self.ignoreError     = ignoreError
     self.evpFileName     = evpFileName
     self.token           = datetime.datetime.utcnow().isoformat(' ')        
     # logger
     self.logger = LogWrapper(_logger,self.token)
     self.pd2p            = DynDataDistributer.DynDataDistributer([],self.taskBuffer,self.siteMapper,
                                                                  token=' ',logger=self.logger)
     self.userDatasetName = ''
     self.creationTime    = ''
     self.params          = ''
     self.lockedBy        = ''
     self.evpFile         = None
     self.userTaskName    = ''
     # message buffer
     self.msgBuffer       = []
     self.lineLimit       = 100
     # JEDI
     self.jediTaskID      = None
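The constructor above is the EventPicker constructor shown in full in Example #11; a brief wiring sketch follows, with the taskBuffer/siteMapper setup copied from Example #10 and a made-up .evp file path.

# sketch only: the .evp path is a placeholder; DB setup follows Example #10
from taskbuffer.TaskBuffer import taskBuffer
from brokerage.SiteMapper import SiteMapper
from config import panda_config

taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
siteMapper = SiteMapper(taskBuffer)
picker = EventPicker(taskBuffer, siteMapper,
                     '%s/evp.sample' % panda_config.logdir, ignoreError=True)
picker.run()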
Example #7
class AdderGen:
    # constructor
    def __init__(self,
                 taskBuffer,
                 jobID,
                 jobStatus,
                 xmlFile,
                 ignoreTmpError=True,
                 siteMapper=None):
        self.job = None
        self.jobID = jobID
        self.jobStatus = jobStatus
        self.taskBuffer = taskBuffer
        self.ignoreTmpError = ignoreTmpError
        self.lockXML = None
        self.siteMapper = siteMapper
        self.attemptNr = None
        self.xmlFile = xmlFile
        self.datasetMap = {}
        self.extraInfo = {
            'surl': {},
            'nevents': {},
            'lbnr': {},
            'endpoint': {}
        }
        # extract attemptNr
        try:
            tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1]
            if re.search('^\d+$', tmpAttemptNr) != None:
                self.attemptNr = int(tmpAttemptNr)
        except:
            pass
        # logger
        self.logger = LogWrapper(_logger, str(self.jobID))

    # dump file report
    def dumpFileReport(self, fileCatalog, attemptNr):
        self.logger.debug("dump file report")
        # dump Catalog into file
        if attemptNr == None:
            xmlFile = '%s/%s_%s_%s' % (panda_config.logdir, self.jobID,
                                       self.jobStatus, str(uuid.uuid4()))
        else:
            xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir, self.jobID,
                                          self.jobStatus, str(
                                              uuid.uuid4()), attemptNr)
        file = open(xmlFile, 'w')
        file.write(fileCatalog)
        file.close()

    # get plugin class
    def getPluginClass(self, tmpVO):
        # instantiate concrete plugin
        adderPluginClass = panda_config.getPlugin('adder_plugins', tmpVO)
        if adderPluginClass == None:
            # use ATLAS plugin by default
            from AdderAtlasPlugin import AdderAtlasPlugin
            adderPluginClass = AdderAtlasPlugin
        self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
        return adderPluginClass

    # main
    def run(self):
        try:
            self.logger.debug("new start: %s attemptNr=%s" %
                              (self.jobStatus, self.attemptNr))
            # lock XML
            self.lockXML = open(self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(),
                            fcntl.LOCK_EX | fcntl.LOCK_NB)
            except:
                self.logger.debug("cannot get lock : %s" % self.xmlFile)
                self.lockXML.close()
                # remove XML just in case for the final attempt
                if not self.ignoreTmpError:
                    try:
                        # remove Catalog
                        os.remove(self.xmlFile)
                    except:
                        pass
                return
            # check if file exists
            if not os.path.exists(self.xmlFile):
                self.logger.debug("not exist : %s" % self.xmlFile)
                try:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                    self.lockXML.close()
                except:
                    pass
                return
            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID],
                                                fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
            # check if job has finished
            if self.job == None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in [
                    'finished', 'failed', 'unknown', 'merging'
            ]:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' %
                                  (self.job.attemptNr, self.attemptNr))
            elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
                errMsg = 'XML with attemptNr for {0}'.format(
                    self.job.jobStatus)
                self.logger.error(errMsg)
                # FIXME
                raise RuntimeError, errMsg
            elif self.jobStatus == EventServiceUtils.esRegStatus:
                # instantiate concrete plugin
                adderPluginClass = self.getPluginClass(self.job.VO)
                adderPlugin = adderPluginClass(self.job,
                                               taskBuffer=self.taskBuffer,
                                               siteMapper=self.siteMapper,
                                               logger=self.logger)
                # execute
                self.logger.debug('plugin is ready for ES file registration')
                adderPlugin.registerEventServiceFiles()
            else:
                # check file status in JEDI
                if not self.job.isCancelled(
                ) and not self.job.taskBufferErrorCode in [
                        taskbuffer.ErrorCode.EC_PilotRetried
                ]:
                    fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(
                        self.job)
                    self.logger.debug("check file status in JEDI : {0}".format(
                        fileCheckInJEDI))
                    if fileCheckInJEDI == None:
                        raise RuntimeError, 'failed to check file status in JEDI'
                    if fileCheckInJEDI == False:
                        # set job status to failed since some file status is wrong in JEDI
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        errStr = "inconsistent file status between Panda and JEDI. "
                        errStr += "failed to avoid duplicated processing caused by synchronization failure"
                        self.job.ddmErrorDiag = errStr
                        self.logger.debug(
                            "set jobStatus={0} since input is inconsistent between Panda and JEDI"
                            .format(self.jobStatus))
                    elif self.job.jobSubStatus in ['pilot_closed']:
                        # terminated by the pilot
                        self.logger.debug(
                            "going to closed since terminated by the pilot")
                        retClosed = self.taskBuffer.killJobs([self.jobID],
                                                             'pilot', '60',
                                                             True)
                        if retClosed[0] == True:
                            self.logger.debug("end")
                            try:
                                # remove Catalog
                                os.remove(self.xmlFile)
                            except:
                                pass
                            # unlock XML
                            if self.lockXML != None:
                                fcntl.flock(self.lockXML.fileno(),
                                            fcntl.LOCK_UN)
                                self.lockXML.close()
                            return
                    # check for cloned jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        checkJC = self.taskBuffer.checkClonedJob(self.job)
                        if checkJC == None:
                            raise RuntimeError, 'failed to check the cloned job'
                        # failed to lock semaphore
                        if checkJC['lock'] == False:
                            self.jobStatus = 'failed'
                            self.job.ddmErrorCode = ErrorCode.EC_Adder
                            self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                            self.logger.debug(
                                "set jobStatus={0} since did not get semaphore for job cloning"
                                .format(self.jobStatus))
                # use failed for cancelled/closed jobs
                if self.job.isCancelled():
                    self.jobStatus = 'failed'
                    # reset error codes to skip retrial module
                    self.job.pilotErrorCode = 0
                    self.job.exeErrorCode = 0
                    self.job.ddmErrorCode = 0
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if not self.job.jobStatus in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # instantiate concrete plugin
                        adderPluginClass = self.getPluginClass(self.job.VO)
                        adderPlugin = adderPluginClass(
                            self.job,
                            taskBuffer=self.taskBuffer,
                            siteMapper=self.siteMapper,
                            extraInfo=self.extraInfo,
                            logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' %
                                          (addResult.statusCode))
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        self.logger.error(
                            "failed to execute AdderPlugin for VO={0} with {1}:{2}"
                            .format(self.job.VO, errtype, errvalue))
                        addResult = None
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"

                    # ignore temporary errors
                    if self.ignoreTmpError and addResult != None and addResult.isTemporary(
                    ):
                        self.logger.debug(': ignore %s ' %
                                          self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type, value))
                            self.logger.debug("cannot unlock XML")
                        return
                    # failed
                    if addResult == None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                self.logger.debug(
                    "status after plugin call :job.jobStatus=%s jobStatus=%s" %
                    (self.job.jobStatus, self.jobStatus))
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    # First of all: check if job failed and in this case take first actions according to error table
                    source, error_code, error_diag = None, None, None
                    if self.job.pilotErrorCode:
                        source = 'pilotErrorCode'
                        error_code = self.job.pilotErrorCode
                        error_diag = self.job.pilotErrorDiag
                    elif self.job.exeErrorCode:
                        source = 'exeErrorCode'
                        error_code = self.job.exeErrorCode
                        error_diag = self.job.exeErrorDiag
                    elif self.job.ddmErrorCode:
                        source = 'ddmErrorCode'
                        error_code = self.job.ddmErrorCode
                        error_diag = self.job.ddmErrorDiag
                    elif self.job.transExitCode:
                        source = 'transExitCode'
                        error_code = self.job.transExitCode
                        error_diag = ''

                    # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag))

                    if source and error_code:
                        try:
                            self.logger.debug(
                                "AdderGen.run will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, self.job.PandaID, source,
                                error_code, error_diag, self.job.attemptNr)
                            self.logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            self.logger.error(
                                "apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                % (e, traceback.format_exc()))

                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output', 'log']:
                            if addResult != None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult != None and addResult.mergingFiles != []:
                        # set status for merging:
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    elif addResult != None and addResult.transferringFiles != []:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime == 'NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                                     time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except:
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                if oldJobStatus in ['cancelled', 'closed']:
                    pass
                else:
                    self.logger.debug("updating DB")
                    retU = self.taskBuffer.updateJobs(
                        [self.job],
                        False,
                        oldJobStatusList=[oldJobStatus],
                        extraInfo=self.extraInfo)
                    self.logger.debug("retU: %s" % retU)
                    # failed
                    if not retU[0]:
                        self.logger.error(
                            'failed to update DB for pandaid={0}'.format(
                                self.job.PandaID))
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type, value))
                            self.logger.debug("cannot unlock XML")
                        return

                    try:
                        # updateJobs was successful and it failed a job with taskBufferErrorCode
                        self.logger.debug("AdderGen.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs(
                            [self.job.PandaID],
                            fromDefined=False,
                            fromArchived=True,
                            fromWaiting=False)[0]
                        self.logger.debug(
                            "status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}"
                            .format(job_tmp.jobStatus,
                                    job_tmp.taskBufferErrorCode,
                                    job_tmp.taskBufferErrorDiag))
                        if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            self.logger.debug(
                                "AdderGen.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, job_tmp.PandaID, source,
                                error_code, error_diag, job_tmp.attemptNr)
                            self.logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        self.logger.error(
                            "apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                            % (e, traceback.format_exc()))

                    # setup for closer
                    if not (EventServiceUtils.isEventServiceJob(self.job)
                            and self.job.isCancelled()):
                        destDBList = []
                        guidList = []
                        for file in self.job.Files:
                            # ignore inputs
                            if file.type == 'input':
                                continue
                            # skip pseudo datasets
                            if file.destinationDBlock in ['', None, 'NULL']:
                                continue
                            # start closer for output/log datasets
                            if not file.destinationDBlock in destDBList:
                                destDBList.append(file.destinationDBlock)
                            # collect GUIDs
                            if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test','rucio_test'] and \
                                                                      self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                                      and file.type == 'output':
                                # extract base LFN since LFN was changed to full LFN for CMS
                                baseLFN = file.lfn.split('/')[-1]
                                guidList.append({
                                    'lfn': baseLFN,
                                    'guid': file.GUID,
                                    'type': file.type,
                                    'checksum': file.checksum,
                                    'md5sum': file.md5sum,
                                    'fsize': file.fsize,
                                    'scope': file.scope
                                })
                        if guidList != []:
                            retG = self.taskBuffer.setGUIDs(guidList)
                        if destDBList != []:
                            # start Closer
                            if adderPlugin != None and hasattr(
                                    adderPlugin, 'datasetMap'
                            ) and adderPlugin.datasetMap != {}:
                                cThr = Closer.Closer(
                                    self.taskBuffer,
                                    destDBList,
                                    self.job,
                                    datasetMap=adderPlugin.datasetMap)
                            else:
                                cThr = Closer.Closer(self.taskBuffer,
                                                     destDBList, self.job)
                            self.logger.debug("start Closer")
                            cThr.start()
                            cThr.join()
                            self.logger.debug("end Closer")
                        # run closer for associated parallel jobs
                        if EventServiceUtils.isJobCloningJob(self.job):
                            assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(
                                self.job.jediTaskID, self.job.PandaID,
                                destDBList)
                            for assJobID, assDBlocks in assDBlockMap.iteritems(
                            ):
                                assJob = self.taskBuffer.peekJobs(
                                    [assJobID],
                                    fromDefined=False,
                                    fromArchived=False,
                                    fromWaiting=False,
                                    forAnal=True)[0]
                                if assJob == None:
                                    self.logger.debug(
                                        ': associated job PandaID={0} not found in DB'
                                        .format(assJobID))
                                else:
                                    cThr = Closer.Closer(
                                        self.taskBuffer, assDBlocks, assJob)
                                    self.logger.debug(
                                        "start Closer for PandaID={0}".format(
                                            assJobID))
                                    cThr.start()
                                    cThr.join()
                                    self.logger.debug(
                                        "end Closer for PandaID={0}".format(
                                            assJobID))
            self.logger.debug("end")
            try:
                # remove Catalog
                os.remove(self.xmlFile)
            except:
                pass
            # unlock XML
            if self.lockXML != None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()
        except:
            type, value, traceBack = sys.exc_info()
            errStr = ": %s %s " % (type, value)
            errStr += traceback.format_exc()
            self.logger.error(errStr)
            self.logger.error("except")
            # unlock XML just in case
            try:
                if self.lockXML != None:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            except:
                type, value, traceBack = sys.exc_info()
                self.logger.error(": %s %s" % (type, value))
                self.logger.error("cannot unlock XML")

    # parse XML
    # 0: succeeded, 1: harmless error to exit, 2: fatal error, 3: event service
    def parseXML(self):
        # get LFN and GUID
        self.logger.debug('XML filename : %s' % self.xmlFile)
        # no outputs
        if self.job.Files == []:
            self.logger.debug("has no outputs")
            self.logger.debug("parseXML end")
            return 0
        # get input files
        inputLFNs = []
        for file in self.job.Files:
            if file.type == 'input':
                inputLFNs.append(file.lfn)
        # parse XML
        lfns = []
        guids = []
        fsizes = []
        md5sums = []
        chksums = []
        surls = []
        fullLfnMap = {}
        nEventsMap = {}
        try:
            root = xml.dom.minidom.parse(self.xmlFile)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical = file.getElementsByTagName('logical')[0]
                lfnNode = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                # get metadata
                fsize = None
                md5sum = None
                adler32 = None
                surl = None
                fullLFN = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'fsize':
                        fsize = long(meta.getAttribute('att_value'))
                    elif name == 'md5sum':
                        md5sum = str(meta.getAttribute('att_value'))
                        # check
                        if re.search("^[a-fA-F0-9]{32}$", md5sum) == None:
                            md5sum = None
                    elif name == 'adler32':
                        adler32 = str(meta.getAttribute('att_value'))
                    elif name == 'surl':
                        surl = str(meta.getAttribute('att_value'))
                    elif name == 'full_lfn':
                        fullLFN = str(meta.getAttribute('att_value'))
                # endpoints
                self.extraInfo['endpoint'][lfn] = []
                for epNode in file.getElementsByTagName('endpoint'):
                    self.extraInfo['endpoint'][lfn].append(
                        str(epNode.firstChild.data))
                # error check
                if (not lfn in inputLFNs) and (fsize == None or
                                               (md5sum == None
                                                and adler32 == None)):
                    if EventServiceUtils.isEventServiceMerge(self.job):
                        continue
                    else:
                        raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                # append
                lfns.append(lfn)
                guids.append(guid)
                fsizes.append(fsize)
                md5sums.append(md5sum)
                surls.append(surl)
                if adler32 != None:
                    # use adler32 if available
                    chksums.append("ad:%s" % adler32)
                else:
                    chksums.append("md5:%s" % md5sum)
                if fullLFN != None:
                    fullLfnMap[lfn] = fullLFN
        except:
            # parse json
            try:
                import json
                with open(self.xmlFile) as tmpF:
                    jsonDict = json.load(tmpF)
                    for lfn, fileData in jsonDict.iteritems():
                        lfn = str(lfn)
                        fsize = None
                        md5sum = None
                        adler32 = None
                        surl = None
                        fullLFN = None
                        guid = str(fileData['guid'])
                        if 'fsize' in fileData:
                            fsize = long(fileData['fsize'])
                        if 'md5sum' in fileData:
                            md5sum = str(fileData['md5sum'])
                            # check
                            if re.search("^[a-fA-F0-9]{32}$", md5sum) == None:
                                md5sum = None
                        if 'adler32' in fileData:
                            adler32 = str(fileData['adler32'])
                        if 'surl' in fileData:
                            surl = str(fileData['surl'])
                        if 'full_lfn' in fileData:
                            fullLFN = str(fileData['full_lfn'])
                        # endpoints
                        self.extraInfo['endpoint'][lfn] = []
                        if 'endpoint' in fileData:
                            self.extraInfo['endpoint'][lfn] = fileData[
                                'endpoint']
                        # error check
                        if (not lfn in inputLFNs) and (fsize == None or
                                                       (md5sum == None
                                                        and adler32 == None)):
                            if EventServiceUtils.isEventServiceMerge(self.job):
                                continue
                            else:
                                raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                        # append
                        lfns.append(lfn)
                        guids.append(guid)
                        fsizes.append(fsize)
                        md5sums.append(md5sum)
                        surls.append(surl)
                        if adler32 != None:
                            # use adler32 if available
                            chksums.append("ad:%s" % adler32)
                        else:
                            chksums.append("md5:%s" % md5sum)
                        if fullLFN != None:
                            fullLfnMap[lfn] = fullLFN
            except:
                # check if file exists
                if os.path.exists(self.xmlFile):
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type, value))
                    # set failed anyway
                    self.job.jobStatus = 'failed'
                    # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                    if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                       (self.job.transExitCode  in [0,'0','NULL']):
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                    return 2
                else:
                    # XML was deleted
                    return 1
        # parse metadata to get nEvents
        try:
            root = xml.dom.minidom.parseString(self.job.metadata)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical = file.getElementsByTagName('logical')[0]
                lfnNode = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                # get metadata
                nevents = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'events':
                        nevents = long(meta.getAttribute('att_value'))
                        nEventsMap[lfn] = nevents
                        break
        except:
            pass
        self.logger.debug('nEventsMap=%s' % str(nEventsMap))
        # parse json
        try:
            import json
            jsonDict = json.loads(self.job.metadata)
            for jsonFileItem in jsonDict['files']['output']:
                for jsonSubFileItem in jsonFileItem['subFiles']:
                    lfn = str(jsonSubFileItem['name'])
                    try:
                        nevents = long(jsonSubFileItem['nentries'])
                        nEventsMap[lfn] = nevents
                    except:
                        pass
        except:
            pass
        self.logger.debug('nEventsMapJson=%s' % str(nEventsMap))
        # get lumi block number
        lumiBlockNr = self.job.getLumiBlockNr()
        # copy files for variable number of outputs
        tmpStat = self.copyFilesForVariableNumOutputs(lfns)
        if not tmpStat:
            self.logger.error(
                "failed to copy files for variable number of outputs")
            return 2
        # check files
        fileList = []
        for file in self.job.Files:
            fileList.append(file.lfn)
            if file.type == 'input':
                if file.lfn in lfns:
                    if self.job.prodSourceLabel in ['user', 'panda']:
                        # skipped file
                        file.status = 'skipped'
                    elif self.job.prodSourceLabel in [
                            'managed', 'test', 'rc_test', 'ptest'
                    ]:
                        # failed by pilot
                        file.status = 'failed'
            elif file.type == 'output' or file.type == 'log':
                # add only log file for failed jobs
                if self.jobStatus == 'failed' and file.type != 'log':
                    file.status = 'failed'
                    continue
                # set failed if it is missing in XML
                if not file.lfn in lfns:
                    if self.job.jobStatus == 'finished' and \
                            (EventServiceUtils.isEventServiceJob(self.job) or EventServiceUtils.isJumboJob(self.job)):
                        # unset file status for ES jobs
                        pass
                    elif file.isAllowedNoOutput():
                        # allowed not to be produced
                        file.status = 'nooutput'
                        self.logger.debug('set {0} to status={1}'.format(
                            file.lfn, file.status))
                    else:
                        file.status = 'failed'
                        self.job.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(
                            file.lfn)
                        self.logger.error(self.job.ddmErrorDiag)
                    continue
                # look for GUID with LFN
                try:
                    i = lfns.index(file.lfn)
                    file.GUID = guids[i]
                    file.fsize = fsizes[i]
                    file.md5sum = md5sums[i]
                    file.checksum = chksums[i]
                    surl = surls[i]
                    # status
                    file.status = 'ready'
                    # change to full LFN
                    if fullLfnMap.has_key(file.lfn):
                        file.lfn = fullLfnMap[file.lfn]
                    # add SURL to extraInfo
                    self.extraInfo['surl'][file.lfn] = surl
                    # add nevents
                    if nEventsMap.has_key(file.lfn):
                        self.extraInfo['nevents'][file.lfn] = nEventsMap[
                            file.lfn]
                except:
                    # status
                    file.status = 'failed'
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type, value))
                # set lumi block number
                if lumiBlockNr != None and file.status != 'failed':
                    self.extraInfo['lbnr'][file.lfn] = lumiBlockNr
        # check consistency between XML and filesTable
        for lfn in lfns:
            if not lfn in fileList:
                self.logger.error("%s is not found in filesTable" % lfn)
                self.job.jobStatus = 'failed'
                for tmpFile in self.job.Files:
                    tmpFile.status = 'failed'
                self.job.ddmErrorCode = ErrorCode.EC_Adder
                self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(
                    lfn)
                return 2
        # return
        self.logger.debug("parseXML end")
        return 0

    # copy files for variable number of outputs
    def copyFilesForVariableNumOutputs(self, lfns):
        # get original output files
        origOutputs = {}
        updateOrig = {}
        for tmpFile in self.job.Files:
            if tmpFile.type in ['output', 'log']:
                origOutputs[tmpFile.lfn] = tmpFile
                if tmpFile.lfn in lfns:
                    # keep original
                    updateOrig[tmpFile.lfn] = False
                else:
                    # overwrite original
                    updateOrig[tmpFile.lfn] = True
        # look for unknown files
        addedNewFiles = False
        for newLFN in lfns:
            if not newLFN in origOutputs:
                # look for corresponding original output
                for origLFN in origOutputs.keys():
                    tmpPatt = '^{0}\.*_\d+$'.format(origLFN)
                    if re.search(tmpPatt, newLFN) != None:
                        # copy file record
                        tmpStat = self.taskBuffer.copyFileRecord(
                            newLFN, origOutputs[origLFN], updateOrig[origLFN])
                        if not tmpStat:
                            return False
                        addedNewFiles = True
                        # disable further overwriting
                        updateOrig[origLFN] = False
                        break
        # refresh job info
        if addedNewFiles:
            self.job = self.taskBuffer.peekJobs([self.jobID],
                                                fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
        # return
        return True
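A minimal driver sketch for the AdderGen class above; the DB wiring follows Example #10, and the job ID, job status, and XML file path are illustrative placeholders.

# driver sketch: job ID, status and XML path are made up; DB setup as in Example #10
from taskbuffer.TaskBuffer import taskBuffer
from brokerage.SiteMapper import SiteMapper
from config import panda_config

taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
siteMapper = SiteMapper(taskBuffer)
adder = AdderGen(taskBuffer, 4112, 'finished',
                 '%s/4112_finished_dummy_1' % panda_config.logdir,
                 ignoreTmpError=True, siteMapper=siteMapper)
adder.run()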
Example #8
def _getPFNFromLFC(lfns,dq2url,guids,storageName,scopeList=[],tmpLog=None):
    if tmpLog == None:
        tmpLog = LogWrapper(_log,logPrefix)
    tmpLog.debug('_getPFNFromLFC %s %s / %s LFNs:%s %s' % (dq2url,str(storageName),
                                                         len(lfns),str(lfns[:3]),str(scopeList[:3])))
    outStr = ''
    # check parameter
    if guids == [] or storageName == [] or (len(lfns) != len(guids)):
        tmpLog.debug('_getPFNFromLFC done with empty list')
        return outStr
    # check scopeList
    if not scopeList in [None,[]] and len(lfns) != len(scopeList):
        tmpLog.warning('_getPFNFromLFC wrong scopeList %s %s %s %s' % (dq2url,str(storageName),
                                                                       str(lfns),str(scopeList)))
        tmpLog.error('_getPFNFromLFC failed')
        return outStr
    # loop over all LFNs
    iLFN = 0
    nLFN = 1000
    strFiles = ''    
    outStr = ''
    for iLFN in range(len(lfns)):
        if scopeList != []:
            strFiles  += '%s %s %s\n' % (lfns[iLFN],guids[iLFN],scopeList[iLFN]) 
        else:
            strFiles  += '%s %s\n' % (lfns[iLFN],guids[iLFN]) 
        # bulk operation
        if (iLFN+1) % nLFN == 0 or (iLFN+1) >= len(lfns):
            # write to file
            inFileName = '%s/lfcin.%s'  % (panda_config.logdir,commands.getoutput('uuidgen'))
            ifile = open(inFileName,'w')
            ifile.write(strFiles)
            ifile.close()
            # construct commands
            strStorage = ''
            for storage in storageName:
                strStorage += '%s,' % storage
            strStorage = strStorage[:-1]
            com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd)            
            com+= 'unset LD_LIBRARY_PATH; unset PYTHONPATH; export PATH=/usr/local/bin:/bin:/usr/bin; '
            com+= 'source %s; %s/python -Wignore %s/LFCclient.py -f %s -l %s -s %s' % \
                  (panda_config.glite_source,panda_config.native_python32,panda_config.lfcClient_dir,
                   inFileName,dq2url,strStorage)
            tmpLog.debug(com)
            # execute
            status,output = commands.getstatusoutput(com)
            tmpLog.debug(status)
            if status == 0:
                outStr += output
            else:
                tmpLog.error("_getPFNFromLFC : %s %s %s" % (dq2url,status,output))
                # send message to logger
                try:
                    # make message
                    message = 'LFC access : %s %s %s' % (dq2url,status,output)
                    # get logger
                    _pandaLogger = PandaLogger()
                    _pandaLogger.lock()
                    _pandaLogger.setParams({'Type':'broker_util'})
                    logger = _pandaLogger.getHttpLogger(panda_config.loggername)
                    # add message
                    logger.error(message)
                    # release HTTP handler
                    _pandaLogger.release()
                except:
                    pass
                tmpLog.error('_getPFNFromLFC failed')
                return status
            # reset
            strFiles = ''
    tmpLog.debug('_getPFNFromLFC done')
    # return
    return outStr
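An illustrative call of the helper above; every argument value is a placeholder, and a LogWrapper is passed explicitly so the module-level _log/logPrefix defaults are not needed.

# illustrative call only: LFC endpoint, GUIDs, storage and scope are placeholders
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper

tmpLog = LogWrapper(PandaLogger().getLogger('broker_util'), 'sketch')
out = _getPFNFromLFC(['EVNT.01234._000001.pool.root.1'],
                     'lfc://lfc.example.org:/grid/atlas',
                     ['a1b2c3d4-0000-0000-0000-000000000001'],
                     ['srm://se.example.org'],
                     scopeList=['mc15_13TeV'],
                     tmpLog=tmpLog)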
Example #9
import taskbuffer.ErrorCode
import pandalogger.PandaLogger
from taskbuffer import EventServiceUtils
from pandalogger.PandaLogger import PandaLogger
from brokerage.SiteMapper import SiteMapper
from pandautils import PandaUtils
from pandalogger.LogWrapper import LogWrapper

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('add')

tmpLog = LogWrapper(_logger,None)

tmpLog.debug("===================== start =====================")

# overall timeout value
overallTimeout = 20

# grace period
try:
    gracePeriod = int(sys.argv[1])
except:
    gracePeriod = 3

# current minute
currentMinute = datetime.datetime.utcnow().minute
Example #10
import re
import sys
import datetime
import traceback
from taskbuffer.TaskBuffer import taskBuffer
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper
from brokerage.SiteMapper import SiteMapper

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('prioryMassage')
tmpLog = LogWrapper(_logger)

tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# get usage breakdown
usageBreakDownPerUser = {}
usageBreakDownPerSite = {}
workingGroupList = []
for table in ['ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsArchived4']:
    varMap = {}
Example #11
class EventPicker:
    # constructor
    def __init__(self,taskBuffer,siteMapper,evpFileName,ignoreError):
        self.taskBuffer      = taskBuffer
        self.siteMapper      = siteMapper
        self.ignoreError     = ignoreError
        self.evpFileName     = evpFileName
        self.token           = datetime.datetime.utcnow().isoformat(' ')        
        # logger
        self.logger = LogWrapper(_logger,self.token)
        self.pd2p            = DynDataDistributer.DynDataDistributer([],self.taskBuffer,self.siteMapper,
                                                                     token=' ',logger=self.logger)
        self.userDatasetName = ''
        self.creationTime    = ''
        self.params          = ''
        self.lockedBy        = ''
        self.evpFile         = None
        self.userTaskName    = ''
        # message buffer
        self.msgBuffer       = []
        self.lineLimit       = 100
        # JEDI
        self.jediTaskID      = None


    # main
    def run(self):
        try:
            self.putLog('start %s' % self.evpFileName)            
            # lock evp file
            self.evpFile = open(self.evpFileName)
            try:
                fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_EX|fcntl.LOCK_NB)
            except:
                # release
                self.putLog("cannot lock %s" % self.evpFileName)
                self.evpFile.close()
                return True
            # options
            runEvtList          = []
            eventPickDataType   = ''
            eventPickStreamName = ''
            eventPickDS         = []
            eventPickAmiTag     = ''
            eventPickNumSites   = 1
            inputFileList       = []
            tagDsList           = []
            tagQuery            = ''
            tagStreamRef        = ''
            skipDaTRI           = False
            runEvtGuidMap       = {}
            ei_api              = ''
            # read evp file
            for tmpLine in self.evpFile:
                tmpMatch = re.search('^([^=]+)=(.+)$',tmpLine)
                # check format
                if tmpMatch == None:
                    continue
                tmpItems = tmpMatch.groups()
                if tmpItems[0] == 'runEvent':
                    # get run and event number
                    tmpRunEvt = tmpItems[1].split(',')
                    if len(tmpRunEvt) == 2:
                        runEvtList.append(tmpRunEvt)
                elif tmpItems[0] == 'eventPickDataType':
                    # data type
                    eventPickDataType = tmpItems[1]
                elif tmpItems[0] == 'eventPickStreamName':
                    # stream name
                    eventPickStreamName = tmpItems[1]
                elif tmpItems[0] == 'eventPickDS':
                    # dataset pattern
                    eventPickDS = tmpItems[1].split(',')
                elif tmpItems[0] == 'eventPickAmiTag':
                    # AMI tag
                    eventPickAmiTag = tmpItems[1]
                elif tmpItems[0] == 'eventPickNumSites':
                    # the number of sites where datasets are distributed
                    try:
                        eventPickNumSites = int(tmpItems[1])
                    except:
                        pass
                elif tmpItems[0] == 'userName':
                    # user name
                    self.userDN = tmpItems[1]
                    self.putLog("user=%s" % self.userDN)
                elif tmpItems[0] == 'userTaskName':
                    # user task name
                    self.userTaskName = tmpItems[1]
                elif tmpItems[0] == 'userDatasetName':
                    # user dataset name
                    self.userDatasetName = tmpItems[1]
                elif tmpItems[0] == 'lockedBy':
                    # client name
                    self.lockedBy = tmpItems[1]
                elif tmpItems[0] == 'creationTime':
                    # creation time
                    self.creationTime = tmpItems[1]
                elif tmpItems[0] == 'params':
                    # parameters
                    self.params = tmpItems[1]
                elif tmpItems[0] == 'ei_api':
                    # ei api parameter for MC
                    ei_api = tmpItems[1]
                elif tmpItems[0] == 'inputFileList':
                    # input file list
                    inputFileList = tmpItems[1].split(',')
                    try:
                        inputFileList.remove('')
                    except:
                        pass
                elif tmpItems[0] == 'tagDS':
                    # TAG dataset
                    tagDsList = tmpItems[1].split(',')
                elif tmpItems[0] == 'tagQuery':
                    # query for TAG
                    tagQuery = tmpItems[1]
                elif tmpItems[0] == 'tagStreamRef':
                    # StreamRef for TAG
                    tagStreamRef = tmpItems[1]
                    if not tagStreamRef.endswith('_ref'):
                        tagStreamRef += '_ref'
                elif tmpItems[0] == 'runEvtGuidMap':
                    # GUIDs
                    try:
                        exec "runEvtGuidMap="+tmpItems[1]
                    except:
                        pass
            # extract task name
            if self.userTaskName == '' and self.params != '':
                try:
                    tmpMatch = re.search('--outDS(=| ) *([^ ]+)',self.params)
                    if tmpMatch != None:
                        self.userTaskName = tmpMatch.group(2)
                        if not self.userTaskName.endswith('/'):
                            self.userTaskName += '/'
                except:
                    pass
            # suppress DaTRI
            if self.params != '':
                if '--eventPickSkipDaTRI' in self.params:
                    skipDaTRI = True
            # get compact user name
            compactDN = self.taskBuffer.cleanUserID(self.userDN)
            # get jediTaskID
            self.jediTaskID = self.taskBuffer.getTaskIDwithTaskNameJEDI(compactDN,self.userTaskName)
            # convert 
            if tagDsList == [] or tagQuery == '':
                # convert run/event list to dataset/file list
                tmpRet,locationMap,allFiles = self.pd2p.convertEvtRunToDatasets(runEvtList,
                                                                                eventPickDataType,
                                                                                eventPickStreamName,
                                                                                eventPickDS,
                                                                                eventPickAmiTag,
                                                                                self.userDN,
                                                                                runEvtGuidMap,
                                                                                ei_api
                                                                                )
                if not tmpRet:
                    if 'isFatal' in locationMap and locationMap['isFatal'] == True:
                        self.ignoreError = False
                    self.endWithError('Failed to convert the run/event list to a dataset/file list')
                    return False
            else:
                # get parent dataset/files with TAG
                tmpRet,locationMap,allFiles = self.pd2p.getTagParentInfoUsingTagQuery(tagDsList,tagQuery,tagStreamRef) 
                if not tmpRet:
                    self.endWithError('Failed to get parent dataset/file list with TAG')
                    return False
            # use only files in the list
            if inputFileList != []:
                tmpAllFiles = []
                for tmpFile in allFiles:
                    if tmpFile['lfn'] in inputFileList:
                        tmpAllFiles.append(tmpFile)
                allFiles = tmpAllFiles        
            # remove redundant CN from DN
            tmpDN = self.userDN
            tmpDN = re.sub('/CN=limited proxy','',tmpDN)
            tmpDN = re.sub('(/CN=proxy)+$','',tmpDN)
            # make dataset container
            tmpRet = self.pd2p.registerDatasetContainerWithDatasets(self.userDatasetName,allFiles,
                                                                    locationMap,
                                                                    nSites=eventPickNumSites,
                                                                    owner=tmpDN)
            if not tmpRet:
                self.endWithError('Failed to make a dataset container %s' % self.userDatasetName)
                return False
            # skip DaTRI
            if skipDaTRI:
                # successfully terminated
                self.putLog("skip DaTRI")
                # update task
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID)
            else:
                # get candidates
                tmpRet,candidateMaps = self.pd2p.getCandidates(self.userDatasetName,checkUsedFile=False,
                                                               useHidden=True)
                if not tmpRet:
                    self.endWithError('Failed to find candidate for destination')
                    return False
                # collect all candidates
                allCandidates = [] 
                for tmpDS,tmpDsVal in candidateMaps.iteritems():
                    for tmpCloud,tmpCloudVal in tmpDsVal.iteritems():
                        for tmpSiteName in tmpCloudVal[0]:
                            if not tmpSiteName in allCandidates:
                                allCandidates.append(tmpSiteName)
                if allCandidates == []:
                    self.endWithError('No candidate for destination')
                    return False
                # get list of dataset (container) names
                if eventPickNumSites > 1:
                    # decompose container to transfer datasets separately
                    tmpRet,tmpOut = self.pd2p.getListDatasetReplicasInContainer(self.userDatasetName) 
                    if not tmpRet:
                        self.endWithError('Failed to get the size of %s' % self.userDatasetName)
                        return False
                    userDatasetNameList = tmpOut.keys()
                else:
                    # transfer container at once
                    userDatasetNameList = [self.userDatasetName]
                # loop over all datasets
                sitesUsed = []
                for tmpUserDatasetName in userDatasetNameList:
                    # get size of dataset container
                    tmpRet,totalInputSize = rucioAPI.getDatasetSize(tmpUserDatasetName)
                    if not tmpRet:
                        self.endWithError('Failed to get the size of %s' % tmpUserDatasetName)
                        return False
                    # run brokerage
                    tmpJob = JobSpec()
                    tmpJob.AtlasRelease = ''
                    self.putLog("run brokerage for %s" % tmpDS)
                    brokerage.broker.schedule([tmpJob],self.taskBuffer,self.siteMapper,True,allCandidates,
                                              True,datasetSize=totalInputSize)
                    if tmpJob.computingSite.startswith('ERROR'):
                        self.endWithError('brokerage failed with %s' % tmpJob.computingSite)
                        return False
                    self.putLog("site -> %s" % tmpJob.computingSite)
                    # send transfer request
                    try:
                        tmpDN = rucioAPI.parse_dn(tmpDN)
                        tmpStatus,userInfo = rucioAPI.finger(tmpDN)
                        if not tmpStatus:
                            raise RuntimeError,'user info not found for {0} with {1}'.format(tmpDN,userInfo)
                        tmpDN = userInfo['nickname']
                        tmpDQ2ID = self.siteMapper.getSite(tmpJob.computingSite).ddm_input
                        tmpMsg = "%s ds=%s site=%s id=%s" % ('registerDatasetLocation for DaTRI ',
                                                             tmpUserDatasetName,
                                                             tmpDQ2ID,
                                                             tmpDN)
                        self.putLog(tmpMsg)
                        rucioAPI.registerDatasetLocation(tmpUserDatasetName,[tmpDQ2ID],lifetime=14,owner=tmpDN,
                                                         activity="User Subscriptions")
                        self.putLog('OK')
                    except:
                        errType,errValue = sys.exc_info()[:2]
                        tmpStr = 'Failed to send transfer request : %s %s' % (errType,errValue)
                        tmpStr = tmpStr.strip()
                        tmpStr += traceback.format_exc()
                        self.endWithError(tmpStr)
                        return False
                    # list of sites already used
                    sitesUsed.append(tmpJob.computingSite)
                    self.putLog("used %s sites" % len(sitesUsed))
                    # set candidates
                    if len(sitesUsed) >= eventPickNumSites:
                        # reset candidates to limit the number of sites
                        allCandidates = sitesUsed
                        sitesUsed = []
                    else:
                        # remove site
                        allCandidates.remove(tmpJob.computingSite)
                # send email notification for success
                tmpMsg =  'A transfer request was successfully sent to Rucio.\n'
                tmpMsg += 'Your task will get started once transfer is completed.'
                self.sendEmail(True,tmpMsg)
            try:
                # unlock and delete evp file
                fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_UN)
                self.evpFile.close()
                os.remove(self.evpFileName)
            except:
                pass
            # successfully terminated
            self.putLog("end %s" % self.evpFileName)
            return True
        except:
            errType,errValue = sys.exc_info()[:2]
            self.endWithError('Got exception %s:%s %s' % (errType,errValue,traceback.format_exc()))
            return False


    # end with error
    def endWithError(self,message):
        self.putLog(message,'error')
        # unlock evp file
        try:
            fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_UN)
            self.evpFile.close()
            if not self.ignoreError:
                # remove evp file
                os.remove(self.evpFileName)
                # send email notification
                self.sendEmail(False,message)
        except:
            pass
        # upload log
        if self.jediTaskID != None:
            outLog = self.uploadLog()
            self.taskBuffer.updateTaskErrorDialogJEDI(self.jediTaskID,'event picking failed. '+outLog)
            # update task
            if not self.ignoreError:
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID,'tobroken')
            self.putLog(outLog)
        self.putLog('end %s' % self.evpFileName)
        

    # put log
    def putLog(self,msg,type='debug'):
        tmpMsg = msg
        if type == 'error':
            self.logger.error(tmpMsg)
        else:
            self.logger.debug(tmpMsg)


    # send email notification
    def sendEmail(self,isSucceeded,message):
        # mail address
        toAdder = Notifier(self.taskBuffer,None,[]).getEmail(self.userDN)
        if toAdder == '':
            self.putLog('cannot find email address for %s' % self.userDN,'error')
            return
        # subject
        mailSubject = "PANDA notification for Event-Picking Request"
        # message
        mailBody = "Hello,\n\nHere is your request status for event picking\n\n"
        if isSucceeded:
            mailBody += "Status  : Passed to Rucio\n"
        else:
            mailBody += "Status  : Failed\n"
        mailBody +=     "Created : %s\n" % self.creationTime
        mailBody +=     "Ended   : %s\n" % datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
        mailBody +=     "Dataset : %s\n" % self.userDatasetName
        mailBody +=     "\n"        
        mailBody +=     "Parameters : %s %s\n" % (self.lockedBy,self.params)
        mailBody +=     "\n"
        mailBody +=     "%s\n" % message
        # send
        retVal = MailUtils().send(toAdder,mailSubject,mailBody)
        # return
        return

    
    # upload log
    def uploadLog(self):
        if self.jediTaskID == None:
            return 'cannot find jediTaskID'
        strMsg = self.logger.dumpToString()
        s,o = Client.uploadLog(strMsg,self.jediTaskID)
        if s != 0:
            return "failed to upload log with {0}.".format(s)
        if o.startswith('http'):
            return '<a href="{0}">log</a>'.format(o)
        return o
Example #12
0
def getFilesFromLRC(files,url,guids=[],storageName=[],terminateWhenFailed=False,getPFN=False,
                    scopeList=[]):
    tmpLog = LogWrapper(_log,None)
    tmpLog.debug('getFilesFromLRC "%s" %s' % (url,str(storageName)))    
    # get PFC
    outSTR = ''
    if url.startswith('mysql://'):
        # from MySQL
        outSTR = _getPFNFromMySQL(files,url)
        # get PFN
        if getPFN:
            outPFN = {}
            # FIXME
            tmpLog.debug('RetPFN:%s ' % str(outPFN))            
            return outPFN
    elif url.startswith('http://'):
        # from HTTP I/F
        outSTR = _getPoolFileCatalog(files,url)
        # get PFN
        if getPFN:
            outPFN = {}
            try:
                if not outSTR in ['',None]:
                    root  = xml.dom.minidom.parseString(outSTR)
                    fileNodes = root.getElementsByTagName('File')
                    for file in fileNodes:
                        # get PFN and LFN nodes
                        physical = file.getElementsByTagName('physical')[0]
                        pfnNode  = physical.getElementsByTagName('pfn')[0]
                        logical  = file.getElementsByTagName('logical')[0]
                        lfnNode  = logical.getElementsByTagName('lfn')[0]
                        # convert UTF8 to Raw
                        pfn = str(pfnNode.getAttribute('name'))
                        lfn = str(lfnNode.getAttribute('name'))
                        # assign
                        if not outPFN.has_key(lfn):
                            outPFN[lfn] = []
                        outPFN[lfn].append(pfn)
            except:
                type, value, traceBack = sys.exc_info()
                tmpLog.error(outSTR)
                tmpLog.error("could not parse XML - %s %s" % (type, value))
            tmpLog.debug('RetPFN:%s ' % str(outPFN))                
            return outPFN
    elif url.startswith('lfc://') or url.startswith('rucio://'):
        # from LFC
        timeStart = datetime.datetime.utcnow()
        outSTR = _getPFNFromLFC(files,url,guids,storageName,scopeList=scopeList,tmpLog=tmpLog)
        regTime = datetime.datetime.utcnow() - timeStart
        tmpLog.debug('file lookup for %s LFNs from %s took %s.%03d sec' % (len(files),url,regTime.seconds,
                                                                           regTime.microseconds/1000))
        # get PFN
        if getPFN:
            outPFN = {}
            try:
                if not outSTR in ['',None]:
                    tmpItems = outSTR.split('LFCRet :')
                    tmpItems.remove('')
                    # loop over all returns
                    for tmpItem in tmpItems:
                        exec "tmpLFNmap = %s" % tmpItem
                        for tmpLFN,tmpPFN in tmpLFNmap.iteritems():
                            outPFN[tmpLFN] = tmpPFN
            except:
                type, value, traceBack = sys.exc_info()
                tmpLog.error(outSTR)
                tmpLog.error("could not parse LFC ret - %s %s" % (type, value))
            tmpLog.debug('RetPFN:%s files' % len(outPFN))
            return outPFN
    # check return
    if not isinstance(outSTR,types.StringType):
        if terminateWhenFailed:
            return None
        # set empty string
        outSTR = ''
    # collect OK Files
    okFiles = []
    for file in files:
        if re.search(file,outSTR) != None:
            okFiles.append(file)
    tmpLog.debug('Ret:%s / %s files' % (str(okFiles[:3]),len(okFiles)))
    return okFiles
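A minimal usage sketch for getFilesFromLRC; the catalog URL, LFN, GUID, storage endpoint and scope below are made-up placeholders, not values from any real configuration:
# hedged usage sketch only; every literal here is illustrative
lfns  = ['EVNT.01234._000001.pool.root.1']
guids = ['A1B2C3D4-0000-1111-2222-333344445555']            # one GUID per LFN, same order
pfnMap = getFilesFromLRC(lfns,
                         'rucio://example-rucio.cern.ch:/',  # hypothetical catalog endpoint
                         guids=guids,
                         storageName=['EXAMPLE_DATADISK'],   # hypothetical storage endpoint
                         scopeList=['mc16_13TeV'],           # one scope per LFN
                         getPFN=True)
# with getPFN=True a {lfn: [pfn, ...]} map is returned; without it the function
# returns the subset of LFNs that were found in the catalog output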
Example #13
0
 def __init__(self,job,datasets,log):
     self.jobSpec = job
     self.datasets = datasets
     self.tmpLog = LogWrapper(log,"{0} CloserAtlasPlugin".format(self.jobSpec.PandaID))
Example #14
0
 def run(self):
     try:
         # make a message instance
         tmpLog = LogWrapper(_logger, None)
         # run main procedure in the same process
         if not self.forkRun:
             tmpLog.debug('main start')
             tmpLog.debug('firstSubmission={0}'.format(
                 self.firstSubmission))
             # group jobs per VO
             voJobsMap = {}
             ddmFreeJobs = []
             tmpLog.debug('{0} jobs in total'.format(len(self.jobs)))
             for tmpJob in self.jobs:
                 # set VO=local for DDM free
                 if tmpJob.destinationSE == 'local':
                     tmpVO = 'local'
                 else:
                     tmpVO = tmpJob.VO
                 # make map
                 if not voJobsMap.has_key(tmpVO):
                     voJobsMap[tmpVO] = []
                 voJobsMap[tmpVO].append(tmpJob)
             # loop over all VOs
             for tmpVO, tmpJobList in voJobsMap.iteritems():
                 tmpLog.debug('vo={0} has {1} jobs'.format(
                     tmpVO, len(tmpJobList)))
                 # get plugin
                 setupperPluginClass = panda_config.getPlugin(
                     'setupper_plugins', tmpVO)
                 if setupperPluginClass == None:
                     # use ATLAS plug-in by default
                     from SetupperAtlasPlugin import SetupperAtlasPlugin
                     setupperPluginClass = SetupperAtlasPlugin
                 tmpLog.debug('plugin name -> {0}'.format(
                     setupperPluginClass.__name__))
                 try:
                     # make plugin
                     setupperPlugin = setupperPluginClass(
                         self.taskBuffer,
                         self.jobs,
                         tmpLog,
                         resubmit=self.resubmit,
                         pandaDDM=self.pandaDDM,
                         ddmAttempt=self.ddmAttempt,
                         onlyTA=self.onlyTA,
                         firstSubmission=self.firstSubmission)
                     # run plugin
                     tmpLog.debug('run plugin')
                     setupperPlugin.run()
                     # go forward if not TA
                     if not self.onlyTA:
                         # update jobs
                         tmpLog.debug('update jobs')
                         self.updateJobs(
                             setupperPlugin.jobs + setupperPlugin.jumboJobs,
                             tmpLog)
                         # execute post process
                         tmpLog.debug('post execute plugin')
                         setupperPlugin.postRun()
                     tmpLog.debug('done plugin')
                 except:
                     errtype, errvalue = sys.exc_info()[:2]
                     tmpLog.error('plugin failed with {0}:{1}'.format(
                         errtype, errvalue))
             tmpLog.debug('main end')
         else:
             tmpLog.debug('fork start')
             # write jobs to file
             import os
             import cPickle as pickle
             outFileName = '%s/set.%s_%s' % (panda_config.logdir,
                                             self.jobs[0].PandaID,
                                             commands.getoutput('uuidgen'))
             outFile = open(outFileName, 'w')
             pickle.dump(self.jobs, outFile)
             outFile.close()
             # run main procedure in another process because python doesn't release memory
             com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (
                 panda_config.home_dir_cwd, panda_config.home_dir_cwd)
             com += 'source %s; ' % panda_config.glite_source
             com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \
                    (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python,
                     panda_config.pandaPython_dir,outFileName)
             if self.onlyTA:
                 com += " -t"
             if not self.firstSubmission:
                 com += " -f"
             tmpLog.debug(com)
             # execute
             status, output = self.taskBuffer.processLimiter.getstatusoutput(
                 com)
             tmpLog.debug("return from main process: %s %s" %
                          (status, output))
             tmpLog.debug('fork end')
     except:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('master failed with {0}:{1}'.format(
             errtype, errvalue))
Example #15
0
 def getGUIDsFromEventIndex(self, runEventList, streamName, amiTags,
                            dataType):
     comment = ' /* DBProxy.getGUIDsFromEventIndex */'
     methodName = comment.split(' ')[-2].split('.')[-1]
     tmpLog = LogWrapper(
         _logger,
         methodName + " <streamName={0} amiTags={1} dataType={2}>".format(
             streamName, amiTags, dataType))
     try:
         # change to list
         if not amiTags in [None, '']:
             amiTags = amiTags.replace('*', '.*').split(',')
         tmpLog.debug("start for {0} events".format(len(runEventList)))
         # check data type
         if not dataType in ['RAW', 'ESD', 'AOD']:
             return False, 'dataType={0} is unsupported'.format(dataType)
         # sql to insert runs and events
         sqlRE = "INSERT INTO {0}.TMP_RUN_EVENT_PAIRS (runNumber,eventNumber) ".format(
             panda_config.schemaEI)
         sqlRE += "VALUES (:runNumber,:eventNumber) "
         varMaps = []
         for runNumber, eventNumber in runEventList:
             varMap = {}
             varMap[':runNumber'] = runNumber
             varMap[':eventNumber'] = eventNumber
             varMaps.append(varMap)
         # begin transaction
         self.conn.begin()
         self.cur.arraysize = 100000
         # insert runs and events
         self.cur.executemany(sqlRE + comment, varMaps)
         # read GUIDs
         varMap = {}
         if amiTags in [None, '']:
             sqlRG = "SELECT runNumber,eventNumber,guid_{0} ".format(
                 dataType)
             sqlRG += "FROM {0}.V_PANDA_EVPICK_NOAMITAG_MANY ".format(
                 panda_config.schemaEI)
         else:
             sqlRG = "SELECT runNumber,eventNumber,guid_{0},amiTag ".format(
                 dataType)
             sqlRG += "FROM {0}.V_PANDA_EVPICK_AMITAG_MANY ".format(
                 panda_config.schemaEI)
         if not streamName in [None, '']:
             sqlRG += "WHERE streamName=:streamName "
             varMap[':streamName'] = streamName
         self.cur.execute(sqlRG + comment, varMap)
         resRG = self.cur.fetchall()
         # commit
         if not self._commit():
             raise RuntimeError, 'Commit error'
         retValue = {}
         keyAmiIdxMap = {}
         for tmpItem in resRG:
             if amiTags in [None, '']:
                 runNumber, eventNumber, guid = tmpItem
                 # dummy
                 idxTag = 0
             else:
                 runNumber, eventNumber, guid, amiTag = tmpItem
                 # get index number for the AMI tag in the list
                 idxTag = self.getIndexAmiTag(amiTags, amiTag)
                 # didn't match
                 if idxTag == None:
                     continue
             tmpKey = (runNumber, eventNumber)
             # use AMI tag in a preference order
             if tmpKey in keyAmiIdxMap and keyAmiIdxMap[tmpKey] < idxTag:
                 continue
             keyAmiIdxMap[tmpKey] = idxTag
             retValue[tmpKey] = [guid]
         tmpLog.debug("found {0} events".format(len(retValue)))
         return True, retValue
     except:
         # roll back
         self._rollback()
         # error
         self.dumpErrorMessage(_logger, methodName)
         return False, None
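A hedged call sketch for getGUIDsFromEventIndex, assuming an already-connected DBProxy instance named proxy; the run/event numbers, stream name and AMI tag pattern are invented for illustration:
# illustrative only: resolve AOD GUIDs for two run/event pairs via the Event Index views
runEventList = [(358031, 1234567), (358031, 7654321)]      # hypothetical (run, event) pairs
tmpStat, guidMap = proxy.getGUIDsFromEventIndex(runEventList,
                                                'physics_Main',  # streamName (placeholder)
                                                'f*_m*',         # comma-separated AMI tag patterns
                                                'AOD')           # dataType: RAW, ESD or AOD
if tmpStat:
    # guidMap maps (runNumber, eventNumber) -> [guid], keeping the best-ranked AMI tag
    for tmpKey, tmpGuids in guidMap.iteritems():
        print tmpKey, tmpGuids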
Example #16
0
import sys
import datetime
import traceback
from taskbuffer.TaskBuffer import taskBuffer
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper
from brokerage.SiteMapper import SiteMapper
from taskbuffer import ErrorCode

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('esPreemption')
tmpLog = LogWrapper(_logger)

tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# time limit
timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=15)

# get low priority ES jobs per site
sqlEsJobs = "SELECT PandaID,computingSite,commandToPilot,startTime "
sqlEsJobs += "FROM {0}.jobsActive4 ".format(panda_config.schemaPANDA)
Example #17
0
class AdderGen:
    # constructor
    def __init__(self,taskBuffer,jobID,jobStatus,xmlFile,ignoreTmpError=True,siteMapper=None):
        self.job = None
        self.jobID = jobID
        self.jobStatus = jobStatus
        self.taskBuffer = taskBuffer
        self.ignoreTmpError = ignoreTmpError
        self.lockXML = None
        self.siteMapper = siteMapper
        self.attemptNr = None
        self.xmlFile = xmlFile
        self.datasetMap = {}
        self.extraInfo = {'surl':{},'nevents':{},'lbnr':{}}
        # extract attemptNr
        try:
            tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1]
            if re.search('^\d+$',tmpAttemptNr) != None:
                self.attemptNr = int(tmpAttemptNr)
        except:
            pass
        # logger
        self.logger = LogWrapper(_logger,self.jobID)
        
    # main
    def run(self):
        try:
            self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus,self.attemptNr))
            # lock XML
            self.lockXML = open(self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
            except:
                self.logger.debug("cannot get lock : %s" % self.xmlFile)
                self.lockXML.close()
                # remove XML just in case for the final attempt
                if not self.ignoreTmpError:
                    try:
                        # remove Catalog
                        os.remove(self.xmlFile)
                    except:
                        pass
                return
            # check if file exists
            if not os.path.exists(self.xmlFile):
                self.logger.debug("not exist : %s" % self.xmlFile)
                try:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                    self.lockXML.close()
                except:
                    pass
                return
            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False,
                                                fromArchived=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
            # check if job has finished
            if self.job == None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in ['finished','failed','unknown','cancelled','merging']:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr,self.attemptNr))
            else:
                # check file status in JEDI
                fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
                self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))                
                if fileCheckInJEDI == None:
                    raise RuntimeError,'failed to check file status in JEDI'
                if fileCheckInJEDI == False:
                    # set job status to failed since some file status is wrong in JEDI 
                    self.jobStatus = 'failed'
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "wrong file status in JEDI"
                    self.logger.debug("set jobStatus={0} since input are already cancelled in JEDI".format(self.jobStatus))
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if not self.job.jobStatus in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # set VO=local for DDM free
                        if self.job.destinationSE == 'local':
                            tmpVO = 'local'
                        else:
                            tmpVO = self.job.VO
                        # instantiate concrete plugin
                        adderPluginClass = panda_config.getPlugin('adder_plugins',tmpVO)
                        if adderPluginClass == None:
                            # use ATLAS plugin by default
                            from AdderAtlasPlugin import AdderAtlasPlugin
                            adderPluginClass = AdderAtlasPlugin
                        self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
                        adderPlugin = adderPluginClass(self.job,
                                                       taskBuffer=self.taskBuffer,
                                                       siteMapper=self.siteMapper,
                                                       extraInfo=self.extraInfo,
                                                       logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' % (addResult.statusCode))
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}".format(tmpVO,
                                                                                                         errtype,
                                                                                                         errvalue)) 
                        addResult = None
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"
                    # ignore temporary errors
                    if self.ignoreTmpError and addResult != None and addResult.isTemporary():
                        self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type,value))
                            self.logger.debug("cannot unlock XML")
                        return
                    # failed
                    if addResult == None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output','log']:
                            if addResult != None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult != None and addResult.mergingFiles != []:
                        # set status for merging:                        
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                    elif addResult != None and addResult.transferringFiles != []:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime=='NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except:
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                self.logger.debug("updating DB")
                retU = self.taskBuffer.updateJobs([self.job],False,oldJobStatusList=[oldJobStatus],
                                                  extraInfo=self.extraInfo)
                self.logger.debug("retU: %s" % retU)
                # failed
                if not retU[0]:
                    self.logger.error('failed to update DB')
                    # unlock XML
                    try:
                        fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                        self.lockXML.close()                            
                    except:
                        type, value, traceBack = sys.exc_info()
                        self.logger.debug(": %s %s" % (type,value))
                        self.logger.debug("cannot unlock XML")
                    return
                # setup for closer
                if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.jobStatus == 'cancelled'):
                    destDBList = []
                    guidList = []
                    for file in self.job.Files:
                        # ignore inputs
                        if file.type == 'input':
                            continue
                        # skip pseudo datasets
                        if file.destinationDBlock in ['',None,'NULL']:
                            continue
                        # start closer for output/log datasets
                        if not file.destinationDBlock in destDBList:
                            destDBList.append(file.destinationDBlock)
                        # collect GUIDs
                        if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test','rucio_test'] and \
                                                                  self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                                  and file.type == 'output':
                            # extract base LFN since LFN was changed to full LFN for CMS
                            baseLFN = file.lfn.split('/')[-1]
                            guidList.append({'lfn':baseLFN,'guid':file.GUID,'type':file.type,
                                             'checksum':file.checksum,'md5sum':file.md5sum,
                                             'fsize':file.fsize,'scope':file.scope})
                    if guidList != []:
                        retG = self.taskBuffer.setGUIDs(guidList)
                    if destDBList != []:
                        # start Closer
                        if adderPlugin != None and hasattr(adderPlugin,'datasetMap') and adderPlugin.datasetMap != {}:
                            cThr = Closer.Closer(self.taskBuffer,destDBList,self.job,datasetMap=adderPlugin.datasetMap)
                        else:
                            cThr = Closer.Closer(self.taskBuffer,destDBList,self.job)
                        self.logger.debug("start Closer")
                        cThr.start()
                        cThr.join()
                        self.logger.debug("end Closer")
            self.logger.debug("end")
            try:
                # remove Catalog
                os.remove(self.xmlFile)
            except:
                pass
            # unlock XML
            if self.lockXML != None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()            
        except:
            type, value, traceBack = sys.exc_info()
            self.logger.debug(": %s %s" % (type,value))
            self.logger.debug("except")
            # unlock XML just in case
            try:
                if self.lockXML != None:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            except:
                type, value, traceBack = sys.exc_info()
                self.logger.debug(": %s %s" % (type,value))
                self.logger.debug("cannot unlock XML")


    # parse XML
    # 0: succeeded, 1: harmless error to exit, 2: fatal error, 3: event service
    def parseXML(self):
        # get LFN and GUID
        self.logger.debug('XML filename : %s' % self.xmlFile)
        # no outputs
        if self.job.Files == []:
            self.logger.debug("has no outputs")
            self.logger.debug("parseXML end")
            return 0
        # get input files
        inputLFNs = []
        for file in self.job.Files:
            if file.type == 'input':
                inputLFNs.append(file.lfn)
        # parse XML
        lfns    = []
        guids   = []
        fsizes  = []
        md5sums = []
        chksums = []
        surls   = []
        fullLfnMap = {}
        nEventsMap = {}
        try:
            root  = xml.dom.minidom.parse(self.xmlFile)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical  = file.getElementsByTagName('logical')[0]
                lfnNode  = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                # get metadata
                fsize   = None
                md5sum  = None
                adler32 = None
                surl    = None
                fullLFN = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'fsize':
                        fsize = long(meta.getAttribute('att_value'))
                    elif name == 'md5sum':
                        md5sum = str(meta.getAttribute('att_value'))
                        # check
                        if re.search("^[a-fA-F0-9]{32}$",md5sum) == None:
                            md5sum = None
                    elif name == 'adler32':
                        adler32 = str(meta.getAttribute('att_value'))
                    elif name == 'surl':
                        surl = str(meta.getAttribute('att_value'))
                    elif name == 'full_lfn':
                        fullLFN = str(meta.getAttribute('att_value'))
                # error check
                if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
                    if EventServiceUtils.isEventServiceMerge(self.job):
                        continue
                    else:
                        raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                # append
                lfns.append(lfn)
                guids.append(guid)
                fsizes.append(fsize)
                md5sums.append(md5sum)
                surls.append(surl)
                if adler32 != None:
                    # use adler32 if available
                    chksums.append("ad:%s" % adler32)
                else:
                    chksums.append("md5:%s" % md5sum)
                if fullLFN != None:
                    fullLfnMap[lfn] = fullLFN
        except:
            # check if file exists
            if os.path.exists(self.xmlFile):
                type, value, traceBack = sys.exc_info()
                self.logger.error(": %s %s" % (type,value))
                # set failed anyway
                self.job.jobStatus = 'failed'
                # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                   (self.job.transExitCode  in [0,'0','NULL']):
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                return 2
            else:
                # XML was deleted
                return 1
        # parse metadata to get nEvents
        try:
            root  = xml.dom.minidom.parseString(self.job.metadata)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical  = file.getElementsByTagName('logical')[0]
                lfnNode  = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                # get metadata
                nevents = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'events':
                        nevents = long(meta.getAttribute('att_value'))
                        nEventsMap[lfn] = nevents
                        break
        except:
            pass
        self.logger.debug('nEventsMap=%s' % str(nEventsMap))
        # get lumi block number
        lumiBlockNr = self.job.getLumiBlockNr()
        # check files
        fileList = []
        for file in self.job.Files:
            fileList.append(file.lfn)
            if file.type == 'input':
                if file.lfn in lfns:
                    if self.job.prodSourceLabel in ['user','panda']:
                        # skipped file
                        file.status = 'skipped'
                    elif self.job.prodSourceLabel in ['managed','test','rc_test','ptest']:
                        # failed by pilot
                        file.status = 'failed'
            elif file.type == 'output' or file.type == 'log':
                # add only log file for failed jobs
                if self.jobStatus == 'failed' and file.type != 'log':
                    file.status = 'failed'
                    continue
                # set failed if it is missing in XML
                if not file.lfn in lfns:
                    if self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job):
                        # unset file status for ES jobs
                        pass
                    else:
                        file.status = 'failed'
                    continue
                # look for GUID with LFN
                try:
                    i = lfns.index(file.lfn)
                    file.GUID   = guids[i]
                    file.fsize  = fsizes[i]
                    file.md5sum = md5sums[i]
                    file.checksum = chksums[i]
                    surl = surls[i]
                    # status
                    file.status = 'ready'
                    # change to full LFN
                    if fullLfnMap.has_key(file.lfn):
                        file.lfn = fullLfnMap[file.lfn]
                    # add SURL to extraInfo
                    self.extraInfo['surl'][file.lfn] = surl
                    # add nevents 
                    if nEventsMap.has_key(file.lfn):
                        self.extraInfo['nevents'][file.lfn] = nEventsMap[file.lfn]
                except:
                    # status
                    file.status = 'failed'
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type,value))
                # set lumi block number
                if lumiBlockNr != None and file.status != 'failed':
                    self.extraInfo['lbnr'][file.lfn] = lumiBlockNr 
        # check consistency between XML and filesTable
        for lfn in lfns:
            if not lfn in fileList:
                self.logger.error("%s is not found in filesTable" % lfn)
                self.job.jobStatus = 'failed'
                for tmpFile in self.job.Files:
                    tmpFile.status = 'failed'
                self.job.ddmErrorCode = ErrorCode.EC_Adder
                self.job.ddmErrorDiag = "pilot XML is inconsistent with filesTable"
                return 2
        # return
        self.logger.debug("parseXML end")
        return 0
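A minimal driver sketch for AdderGen, assuming taskBuffer and a SiteMapper instance have already been initialized as in the other examples; the PandaID, job status and XML path are placeholders:
# illustrative only: process one pilot job-completion XML with AdderGen
adder = AdderGen(taskBuffer,
                 4217350123,                           # hypothetical PandaID
                 'finished',                           # job status reported by the pilot
                 '/var/log/panda/4217350123_done_12',  # XML file; trailing "_12" is parsed as attemptNr
                 ignoreTmpError=False,
                 siteMapper=siteMapper)
adder.run()   # locks and parses the XML, runs the adder plugin, updates the job, then triggers Closer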
Example #18
0
import sys
import datetime
import commands

from taskbuffer.TaskBuffer import taskBuffer
from taskbuffer.WorkerSpec import WorkerSpec
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('harvesterCtl')

tmpLog = LogWrapper(_logger,None)

tmpLog.debug("===================== start =====================")

# overall timeout value
overallTimeout = 20

# grace period
try:
    gracePeriod = int(sys.argv[1])
except:
    gracePeriod = 3

# kill old process
try:
    # time limit
Example #20
0
def _getPFNFromLFC(lfns,
                   dq2url,
                   guids,
                   storageName,
                   scopeList=[],
                   tmpLog=None):
    if tmpLog == None:
        tmpLog = LogWrapper(_log, logPrefix)
    tmpLog.debug('_getPFNFromLFC %s %s / %s LFNs:%s %s' %
                 (dq2url, str(storageName), len(lfns), str(
                     lfns[:3]), str(scopeList[:3])))
    outStr = ''
    # check parameter
    if guids == [] or storageName == [] or (len(lfns) != len(guids)):
        tmpLog.debug('_getPFNFromLFC done with empty list')
        return outStr
    # check scopeList
    if not scopeList in [None, []] and len(lfns) != len(scopeList):
        tmpLog.warning('_getPFNFromLFC wrong scopeList %s %s %s %s' %
                       (dq2url, str(storageName), str(lfns), str(scopeList)))
        tmpLog.error('_getPFNFromLFC failed')
        return outStr
    # loop over all LFNs
    iLFN = 0
    nLFN = 1000
    strFiles = ''
    outStr = ''
    for iLFN in range(len(lfns)):
        if scopeList != []:
            strFiles += '%s %s %s\n' % (lfns[iLFN], guids[iLFN],
                                        scopeList[iLFN])
        else:
            strFiles += '%s %s\n' % (lfns[iLFN], guids[iLFN])
        # bulk operation
        if (iLFN + 1) % nLFN == 0 or (iLFN + 1) >= len(lfns):
            # write to file
            inFileName = '%s/lfcin.%s' % (panda_config.logdir,
                                          commands.getoutput('uuidgen'))
            ifile = open(inFileName, 'w')
            ifile.write(strFiles)
            ifile.close()
            # construct commands
            strStorage = ''
            for storage in storageName:
                strStorage += '%s,' % storage
            strStorage = strStorage[:-1]
            com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (
                panda_config.home_dir_cwd, panda_config.home_dir_cwd)
            com += 'unset LD_LIBRARY_PATH; unset PYTHONPATH; export PATH=/usr/local/bin:/bin:/usr/bin; '
            com+= 'source %s; %s/python -Wignore %s/LFCclient.py -f %s -l %s -s %s' % \
                  (panda_config.glite_source,panda_config.native_python32,panda_config.lfcClient_dir,
                   inFileName,dq2url,strStorage)
            tmpLog.debug(com)
            # execute
            status, output = commands.getstatusoutput(com)
            tmpLog.debug(status)
            if status == 0:
                outStr += output
            else:
                tmpLog.error("_getPFNFromLFC : %s %s %s" %
                             (dq2url, status, output))
                # send message to logger
                try:
                    # make message
                    message = 'LFC access : %s %s %s' % (dq2url, status,
                                                         output)
                    # get logger
                    _pandaLogger = PandaLogger()
                    _pandaLogger.lock()
                    _pandaLogger.setParams({'Type': 'broker_util'})
                    logger = _pandaLogger.getHttpLogger(
                        panda_config.loggername)
                    # add message
                    logger.error(message)
                    # release HTTP handler
                    _pandaLogger.release()
                except:
                    pass
                tmpLog.error('_getPFNFromLFC failed')
                return status
            # reset
            strFiles = ''
    tmpLog.debug('_getPFNFromLFC done')
    # return
    return outStr
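A hedged call sketch for _getPFNFromLFC; the LFN, GUID, scope, catalog URL and storage token are invented, and the call only works where the external LFCclient.py helper and its environment are available:
# illustrative only: bulk PFN lookup through the external LFCclient.py helper
out = _getPFNFromLFC(['EVNT.01234._000001.pool.root.1'],             # LFNs
                     'rucio://example-rucio.cern.ch:/',              # hypothetical dq2url
                     ['A1B2C3D4-0000-1111-2222-333344445555'],       # GUIDs, same order as LFNs
                     ['EXAMPLE_DATADISK'],                           # storage endpoints to match
                     scopeList=['mc16_13TeV'])
# on success the concatenated "LFCRet : {...}" output of LFCclient.py is returned as a string;
# if the helper command fails, the non-zero exit status is returned instead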
Example #21
0
import re
from config import panda_config

from taskbuffer.TaskBuffer import taskBuffer

from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper

import panda_proxy_cache

# logger
_logger = PandaLogger().getLogger('panda_activeusers_query')
tmpLog = LogWrapper(_logger)


if __name__ == '__main__' :

    tmpLog.debug("================= start ==================")
    # instantiate TB
    taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
    
    # instantiate MyProxy I/F
    my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface()

    # roles
    if hasattr(panda_config,'proxy_cache_roles'):
        roles = panda_config.proxy_cache_roles.split(',')
    else:
        roles = ['atlas','atlas:/atlas/Role=production','atlas:/atlas/Role=pilot']
    # get users
    sql = 'select distinct DN FROM ATLAS_PANDAMETA.users WHERE GRIDPREF LIKE :patt'
Example #22
0
def updateJob(req,
              jobId,
              state,
              token=None,
              transExitCode=None,
              pilotErrorCode=None,
              pilotErrorDiag=None,
              timestamp=None,
              timeout=60,
              xml='',
              node=None,
              workdir=None,
              cpuConsumptionTime=None,
              cpuConsumptionUnit=None,
              remainingSpace=None,
              schedulerID=None,
              pilotID=None,
              siteName=None,
              messageLevel=None,
              pilotLog='',
              metaData='',
              cpuConversionFactor=None,
              exeErrorCode=None,
              exeErrorDiag=None,
              pilotTiming=None,
              computingElement=None,
              startTime=None,
              endTime=None,
              nEvents=None,
              nInputFiles=None,
              batchID=None,
              attemptNr=None,
              jobMetrics=None,
              stdout='',
              jobSubStatus=None,
              coreCount=None,
              maxRSS=None,
              maxVMEM=None,
              maxSWAP=None,
              maxPSS=None,
              avgRSS=None,
              avgVMEM=None,
              avgSWAP=None,
              avgPSS=None,
              totRCHAR=None,
              totWCHAR=None,
              totRBYTES=None,
              totWBYTES=None,
              rateRCHAR=None,
              rateWCHAR=None,
              rateRBYTES=None,
              rateWBYTES=None):
    tmpLog = LogWrapper(
        _logger, 'updateJob PandaID={0} PID={1}'.format(jobId, os.getpid()))
    tmpLog.debug('start')
    # get DN
    realDN = _getDN(req)
    # get FQANs
    fqans = _getFQAN(req)
    # check production role
    prodManager = _checkRole(fqans,
                             realDN,
                             jobDispatcher,
                             site=siteName,
                             hostname=req.get_remote_host())
    # check token
    validToken = _checkToken(token, jobDispatcher)
    # accept json
    acceptJson = req.acceptJson()
    _logger.debug(
        "updateJob(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,attemptNr:%s,jobSubStatus:%s,core:%s,DN:%s,role:%s,token:%s,val:%s,FQAN:%s,maxRSS=%s,maxVMEM=%s,maxSWAP=%s,maxPSS=%s,avgRSS=%s,avgVMEM=%s,avgSWAP=%s,avgPSS=%s,totRCHAR=%s,totWCHAR=%s,totRBYTES=%s,totWBYTES=%s,rateRCHAR=%s,rateWCHAR=%s,rateRBYTES=%s,rateWBYTES=%s\n==XML==\n%s\n==LOG==\n%s\n==Meta==\n%s\n==Metrics==\n%s\n==stdout==\n%s)"
        % (jobId, state, transExitCode, pilotErrorCode, pilotErrorDiag, node,
           workdir, cpuConsumptionTime, cpuConsumptionUnit, remainingSpace,
           schedulerID, pilotID, siteName, messageLevel, nEvents, nInputFiles,
           cpuConversionFactor, exeErrorCode, exeErrorDiag, pilotTiming,
           computingElement, startTime, endTime, batchID, attemptNr,
           jobSubStatus, coreCount, realDN, prodManager, token, validToken,
           str(fqans), maxRSS, maxVMEM, maxSWAP, maxPSS, avgRSS, avgVMEM,
           avgSWAP, avgPSS, totRCHAR, totWCHAR, totRBYTES, totWBYTES,
           rateRCHAR, rateWCHAR, rateRBYTES, rateWBYTES, xml, pilotLog[:1024],
           metaData[:1024], jobMetrics, stdout))
    _pilotReqLogger.debug('method=updateJob,site=%s,node=%s,type=None' %
                          (siteName, node))
    # invalid role
    if not prodManager:
        _logger.warning("updateJob(%s) : invalid role" % jobId)
        if acceptJson:
            tmpMsg = 'no production/pilot role in VOMS FQANs or non pilot owner'
        else:
            tmpMsg = None
        return Protocol.Response(Protocol.SC_Role, tmpMsg).encode(acceptJson)
    # invalid token
    if not validToken:
        _logger.warning("updateJob(%s) : invalid token" % jobId)
        return Protocol.Response(Protocol.SC_Invalid).encode(acceptJson)
    # aborting message
    if jobId == 'NULL':
        return Protocol.Response(Protocol.SC_Success).encode(acceptJson)
    # check status
    if not state in [
            'running', 'failed', 'finished', 'holding', 'starting',
            'transferring'
    ]:
        _logger.warning("invalid state=%s for updateJob" % state)
        return Protocol.Response(Protocol.SC_Success).encode(acceptJson)
    # create parameter map
    param = {}
    if cpuConsumptionTime != None:
        param['cpuConsumptionTime'] = cpuConsumptionTime
    if cpuConsumptionUnit != None:
        param['cpuConsumptionUnit'] = cpuConsumptionUnit
    if node != None:
        param['modificationHost'] = node[:128]
    if transExitCode != None:
        param['transExitCode'] = transExitCode
    if pilotErrorCode != None:
        param['pilotErrorCode'] = pilotErrorCode
    if pilotErrorDiag != None:
        param['pilotErrorDiag'] = pilotErrorDiag[:500]
    if jobMetrics != None:
        param['jobMetrics'] = jobMetrics[:500]
    if schedulerID != None:
        param['schedulerID'] = schedulerID
    if pilotID != None:
        param['pilotID'] = pilotID[:200]
    if batchID != None:
        param['batchID'] = batchID[:80]
    if exeErrorCode != None:
        param['exeErrorCode'] = exeErrorCode
    if exeErrorDiag != None:
        param['exeErrorDiag'] = exeErrorDiag[:500]
    if cpuConversionFactor != None:
        param['cpuConversion'] = cpuConversionFactor
    if pilotTiming != None:
        param['pilotTiming'] = pilotTiming
    if computingElement != None:
        param['computingElement'] = computingElement
    if nEvents != None:
        param['nEvents'] = nEvents
    if nInputFiles != None:
        param['nInputFiles'] = nInputFiles
    if not jobSubStatus in [None, '']:
        param['jobSubStatus'] = jobSubStatus
    if not coreCount in [None, '']:
        param['actualCoreCount'] = coreCount
    if maxRSS != None:
        param['maxRSS'] = maxRSS
    if maxVMEM != None:
        param['maxVMEM'] = maxVMEM
    if maxSWAP != None:
        param['maxSWAP'] = maxSWAP
    if maxPSS != None:
        param['maxPSS'] = maxPSS
    if avgRSS != None:
        param['avgRSS'] = avgRSS
    if avgVMEM != None:
        param['avgVMEM'] = avgVMEM
    if avgSWAP != None:
        param['avgSWAP'] = avgSWAP
    if avgPSS != None:
        param['avgPSS'] = avgPSS
    if totRCHAR is not None:
        totRCHAR = int(totRCHAR) / 1024  # convert to kByte
        totRCHAR = min(10**10 - 1, totRCHAR)  # limit to 10 digit
        param['totRCHAR'] = totRCHAR
    if totWCHAR is not None:
        totWCHAR = int(totWCHAR) / 1024  # convert to kByte
        totWCHAR = min(10**10 - 1, totWCHAR)  # limit to 10 digit
        param['totWCHAR'] = totWCHAR
    if totRBYTES is not None:
        totRBYTES = int(totRBYTES) / 1024  # convert to kByte
        totRBYTES = min(10**10 - 1, totRBYTES)  # limit to 10 digit
        param['totRBYTES'] = totRBYTES
    if totWBYTES is not None:
        totWBYTES = int(totWBYTES) / 1024  # convert to kByte
        totWBYTES = min(10**10 - 1, totWBYTES)  # limit to 10 digit
        param['totWBYTES'] = totWBYTES
    if rateRCHAR is not None:
        rateRCHAR = min(10**10 - 1, int(rateRCHAR))  # limit to 10 digit
        param['rateRCHAR'] = rateRCHAR
    if rateWCHAR is not None:
        rateWCHAR = min(10**10 - 1, int(rateWCHAR))  # limit to 10 digit
        param['rateWCHAR'] = rateWCHAR
    if rateRBYTES is not None:
        rateRBYTES = min(10**10 - 1, int(rateRBYTES))  # limit to 10 digit
        param['rateRBYTES'] = rateRBYTES
    if rateWBYTES is not None:
        rateWBYTES = min(10**10 - 1, int(rateWBYTES))  # limit to 10 digit
        param['rateWBYTES'] = rateWBYTES
    if startTime != None:
        try:
            param['startTime'] = datetime.datetime(
                *time.strptime(startTime, '%Y-%m-%d %H:%M:%S')[:6])
        except:
            pass
    if endTime != None:
        try:
            param['endTime'] = datetime.datetime(
                *time.strptime(endTime, '%Y-%m-%d %H:%M:%S')[:6])
        except:
            pass
    if attemptNr != None:
        try:
            attemptNr = int(attemptNr)
        except:
            attemptNr = None
    if stdout != '':
        stdout = stdout[:2048]
    # invoke JD
    tmpLog.debug('executing')
    return jobDispatcher.updateJob(int(jobId), state, int(timeout), xml,
                                   siteName, param, metaData, pilotLog,
                                   attemptNr, stdout, acceptJson)
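
The parameter map above silently normalises the pilot-reported I/O counters: byte values are converted to kilobytes and clamped so they fit a 10-digit database column. Below is a minimal standalone sketch of that pattern; the helper name is hypothetical and not part of the PanDA code.

# Minimal sketch (hypothetical helper) of the counter normalisation applied
# above to totRCHAR/totWCHAR/totRBYTES/totWBYTES.
def normalise_io_counter(value_in_bytes):
    """Convert a byte counter to kByte and clamp it to at most 10 digits."""
    if value_in_bytes is None:
        return None
    value_kb = int(value_in_bytes) // 1024   # convert to kByte
    return min(10**10 - 1, value_kb)         # limit to 10 digits

if __name__ == "__main__":
    print(normalise_io_counter(3.5 * 1024**4))  # ~3.5 TB read -> 3758096384 kByte
    print(normalise_io_counter(None))           # None is passed through
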
Example #23
 def application(environ, start_response):
     # get method name
     methodName = ''
     if environ.has_key('SCRIPT_NAME'):
         methodName = environ['SCRIPT_NAME'].split('/')[-1]
     tmpLog = LogWrapper(_logger,
                         "PID={0} {1}".format(os.getpid(), methodName))
     tmpLog.debug("start")
     regStart = datetime.datetime.utcnow()
     retType = None
     # check method name
     if not methodName in allowedMethods:
         tmpLog.error("is forbidden")
         exeRes = "False : %s is forbidden" % methodName
     else:
         # get method object
         tmpMethod = None
         try:
             exec "tmpMethod = %s" % methodName
         except:
             pass
         # object not found
         if tmpMethod == None:
             tmpLog.error("is undefined")
             exeRes = "False"
         else:
             try:
                 # get params
                 tmpPars = cgi.FieldStorage(environ['wsgi.input'],
                                            environ=environ,
                                            keep_blank_values=1)
                 # convert to map
                 params = {}
                 for tmpKey in tmpPars.keys():
                     if tmpPars[tmpKey].file != None and tmpPars[
                             tmpKey].filename != None:
                         # file
                         params[tmpKey] = tmpPars[tmpKey]
                     else:
                         # string
                         params[tmpKey] = tmpPars.getfirst(tmpKey)
                 if panda_config.entryVerbose:
                     tmpLog.debug("with %s" % str(params.keys()))
                 # dummy request object
                 dummyReq = DummyReq(environ, tmpLog)
                 # exec
                 exeRes = apply(tmpMethod, [dummyReq], params)
                 # extract return type
                 if type(exeRes) == types.DictType:
                     retType = exeRes['type']
                     exeRes = exeRes['content']
                 # convert bool to string
                 if exeRes in [True, False]:
                     exeRes = str(exeRes)
             except Exception as e:
                 tmpLog.error("execution failure : {0}".format(str(e)))
                 errStr = ""
                 for tmpKey, tmpVal in environ.iteritems():
                     errStr += "%s : %s\n" % (tmpKey, str(tmpVal))
                 tmpLog.error(errStr)
                 # return internal server error
                 start_response('500 INTERNAL SERVER ERROR',
                                [('Content-Type', 'text/plain')])
                 return [str(e)]
     if panda_config.entryVerbose:
         tmpLog.debug("done")
     regTime = datetime.datetime.utcnow() - regStart
     tmpLog.info(
         "exec_time=%s.%03d sec, return len=%s B" %
         (regTime.seconds, regTime.microseconds / 1000, len(str(exeRes))))
     # return
     if exeRes == taskbuffer.ErrorCode.EC_NotFound:
         start_response('404 Not Found', [('Content-Type', 'text/plain')])
         return ['not found']
     elif isinstance(exeRes, taskbuffer.ErrorCode.EC_Redirect):
         start_response('302 Redirect', [('Location', exeRes.url)])
         return ['redirect']
     else:
         if retType == 'json':
             start_response('200 OK',
                            [('Content-Type', 'application/json')])
         else:
             start_response('200 OK', [('Content-Type', 'text/plain')])
         return [exeRes]
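
The entry point above resolves the last component of SCRIPT_NAME to a module-level function and applies it to the parsed CGI parameters. A minimal, self-contained sketch of that dispatch idea follows; it uses a plain globals() lookup and a toy handler instead of the real allowedMethods/exec machinery.

# Minimal WSGI dispatch sketch; isAlive here is a toy stand-in, not the real
# PanDA method, and the real server resolves names against allowedMethods.
from wsgiref.util import setup_testing_defaults

def isAlive(req):
    return "alive=yes"

allowedMethods = ['isAlive']

def application(environ, start_response):
    methodName = environ.get('SCRIPT_NAME', '').split('/')[-1]
    if methodName not in allowedMethods:
        start_response('403 Forbidden', [('Content-Type', 'text/plain')])
        return [b'forbidden']
    body = globals()[methodName](None)
    start_response('200 OK', [('Content-Type', 'text/plain')])
    return [body.encode()]

if __name__ == "__main__":
    # exercise the app in-process instead of starting a server
    environ = {}
    setup_testing_defaults(environ)
    environ['SCRIPT_NAME'] = '/server/panda/isAlive'
    print(application(environ, lambda status, headers: None))  # [b'alive=yes']
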
Example #24
class AdderGen:
    # constructor
    def __init__(self,taskBuffer,jobID,jobStatus,xmlFile,ignoreTmpError=True,siteMapper=None):
        self.job = None
        self.jobID = jobID
        self.jobStatus = jobStatus
        self.taskBuffer = taskBuffer
        self.ignoreTmpError = ignoreTmpError
        self.lockXML = None
        self.siteMapper = siteMapper
        self.attemptNr = None
        self.xmlFile = xmlFile
        self.datasetMap = {}
        self.extraInfo = {'surl':{},'nevents':{},'lbnr':{},'endpoint':{}, 'guid':{}}
        # extract attemptNr
        try:
            tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1]
            if re.search('^\d+$',tmpAttemptNr) != None:
                self.attemptNr = int(tmpAttemptNr)
        except:
            pass
        # logger
        self.logger = LogWrapper(_logger,str(self.jobID))


    # dump file report
    def dumpFileReport(self,fileCatalog,attemptNr):
        self.logger.debug("dump file report")
        # dump Catalog into file
        if attemptNr == None:
            xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,self.jobID,self.jobStatus,
                                       str(uuid.uuid4()))
        else:
            xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir,self.jobID,self.jobStatus,
                                          str(uuid.uuid4()),attemptNr)
        file = open(xmlFile,'w')
        file.write(fileCatalog)
        file.close()


    # get plugin class
    def getPluginClass(self, tmpVO):
        # instantiate concrete plugin
        adderPluginClass = panda_config.getPlugin('adder_plugins',tmpVO)
        if adderPluginClass == None:
            # use ATLAS plugin by default
            from AdderAtlasPlugin import AdderAtlasPlugin
            adderPluginClass = AdderAtlasPlugin
        self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
        return adderPluginClass


    # main
    def run(self):
        try:
            self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus,self.attemptNr))
            # lock XML
            self.lockXML = open(self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
            except:
                self.logger.debug("cannot get lock : %s" % self.xmlFile)
                self.lockXML.close()
                # remove XML just in case for the final attempt
                if not self.ignoreTmpError:
                    try:
                        # remove Catalog
                        os.remove(self.xmlFile)
                    except:
                        pass
                return
            # check if file exists
            if not os.path.exists(self.xmlFile):
                self.logger.debug("not exist : %s" % self.xmlFile)
                try:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                    self.lockXML.close()
                except:
                    pass
                return
            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
            # check if job has finished
            if self.job == None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in ['finished','failed','unknown','merging']:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr,self.attemptNr))
            elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
                errMsg = 'XML with attemptNr for {0}'.format(self.job.jobStatus)
                self.logger.error(errMsg)
                # FIXME
                raise RuntimeError, errMsg
            elif self.jobStatus == EventServiceUtils.esRegStatus:
                # instantiate concrete plugin
                adderPluginClass = self.getPluginClass(self.job.VO)
                adderPlugin = adderPluginClass(self.job,
                                               taskBuffer=self.taskBuffer,
                                               siteMapper=self.siteMapper,
                                               logger=self.logger)
                # execute
                self.logger.debug('plugin is ready for ES file registration')
                adderPlugin.registerEventServiceFiles()
            else:
                # check file status in JEDI
                if not self.job.isCancelled() and not self.job.taskBufferErrorCode in [taskbuffer.ErrorCode.EC_PilotRetried]:
                    fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
                    self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))                
                    if fileCheckInJEDI == None:
                        raise RuntimeError,'failed to check file status in JEDI'
                    if fileCheckInJEDI == False:
                        # set job status to failed since some file status is wrong in JEDI 
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        errStr = "inconsistent file status between Panda and JEDI. "
                        errStr += "failed to avoid duplicated processing caused by synchronization failure"
                        self.job.ddmErrorDiag = errStr
                        self.logger.debug("set jobStatus={0} since input is inconsistent between Panda and JEDI".format(self.jobStatus))
                    elif self.job.jobSubStatus in ['pilot_closed']:
                        # terminated by the pilot
                        self.logger.debug("going to closed since terminated by the pilot")
                        retClosed = self.taskBuffer.killJobs([self.jobID],'pilot','60',True)
                        if retClosed[0] == True:
                            self.logger.debug("end")
                            try:
                                # remove Catalog
                                os.remove(self.xmlFile)
                            except:
                                pass
                            # unlock XML
                            if self.lockXML != None:
                                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                                self.lockXML.close()
                            return
                    # check for cloned jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        checkJC = self.taskBuffer.checkClonedJob(self.job)
                        if checkJC == None:
                            raise RuntimeError,'failed to check the cloned job'
                        # failed to lock semaphore
                        if checkJC['lock'] == False:
                            self.jobStatus = 'failed'
                            self.job.ddmErrorCode = ErrorCode.EC_Adder
                            self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                            self.logger.debug("set jobStatus={0} since did not get semaphore for job cloning".format(self.jobStatus))
                # use failed for cancelled/closed jobs
                if self.job.isCancelled():
                    self.jobStatus = 'failed'
                    # reset error codes to skip retrial module
                    self.job.pilotErrorCode = 0
                    self.job.exeErrorCode = 0
                    self.job.ddmErrorCode = 0
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if not self.job.jobStatus in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # instantiate concrete plugin
                        adderPluginClass = self.getPluginClass(self.job.VO)
                        adderPlugin = adderPluginClass(self.job,
                                                       taskBuffer=self.taskBuffer,
                                                       siteMapper=self.siteMapper,
                                                       extraInfo=self.extraInfo,
                                                       logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' % (addResult.statusCode))
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}".format(self.job.VO,
                                                                                                         errtype,
                                                                                                         errvalue)) 
                        addResult = None
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"
                        
                    # ignore temporary errors
                    if self.ignoreTmpError and addResult != None and addResult.isTemporary():
                        self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type,value))
                            self.logger.debug("cannot unlock XML")
                        return
                    # failed
                    if addResult == None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                self.logger.debug("status after plugin call :job.jobStatus=%s jobStatus=%s" % (self.job.jobStatus, self.jobStatus))
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    # First of all: check if job failed and in this case take first actions according to error table
                    source, error_code, error_diag = None, None, None
                    if self.job.pilotErrorCode:
                        source = 'pilotErrorCode'
                        error_code = self.job.pilotErrorCode
                        error_diag = self.job.pilotErrorDiag
                    elif self.job.exeErrorCode:
                        source = 'exeErrorCode'
                        error_code = self.job.exeErrorCode
                        error_diag = self.job.exeErrorDiag
                    elif self.job.ddmErrorCode:
                        source = 'ddmErrorCode'
                        error_code = self.job.ddmErrorCode
                        error_diag = self.job.ddmErrorDiag
                    elif self.job.transExitCode:
                        source = 'transExitCode'
                        error_code = self.job.transExitCode
                        error_diag = ''
            
                    # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag))
                    
                    if source and error_code:
                        try:
                            self.logger.debug("AdderGen.run will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, self.job.PandaID, source, error_code, error_diag, self.job.attemptNr)
                            self.logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            self.logger.error("apply_retrial_rules excepted and needs to be investigated (%s): %s"%(e, traceback.format_exc()))
                    
                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output','log']:
                            if addResult != None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult != None and addResult.mergingFiles != []:
                        # set status for merging:                        
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                    elif addResult != None and addResult.transferringFiles != []:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        self.job.jobSubStatus = None
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime=='NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except:
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                if oldJobStatus in ['cancelled','closed']:
                    pass
                else:
                    self.logger.debug("updating DB")
                    retU = self.taskBuffer.updateJobs([self.job],False,oldJobStatusList=[oldJobStatus],
                                                      extraInfo=self.extraInfo)
                    self.logger.debug("retU: %s" % retU)
                    # failed
                    if not retU[0]:
                        self.logger.error('failed to update DB for pandaid={0}'.format(self.job.PandaID))
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()                            
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type,value))
                            self.logger.debug("cannot unlock XML")
                        return

                    try:
                        # updateJobs was successful and it failed a job with taskBufferErrorCode
                        self.logger.debug("AdderGen.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs([self.job.PandaID], fromDefined=False, fromArchived=True,
                                                           fromWaiting=False)[0]
                        self.logger.debug("status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}".format(job_tmp.jobStatus,
                                                                                                                job_tmp.taskBufferErrorCode,
                                                                                                                job_tmp.taskBufferErrorDiag))
                        if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            self.logger.debug("AdderGen.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source, error_code,
                                                            error_diag, job_tmp.attemptNr)
                            self.logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        self.logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s" % (e, traceback.format_exc()))

                    # setup for closer
                    if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.isCancelled()):
                        destDBList = []
                        guidList = []
                        for file in self.job.Files:
                            # ignore inputs
                            if file.type == 'input':
                                continue
                            # skip pseudo datasets
                            if file.destinationDBlock in ['',None,'NULL']:
                                continue
                            # start closer for output/log datasets
                            if not file.destinationDBlock in destDBList:
                                destDBList.append(file.destinationDBlock)
                            # collect GUIDs
                            if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['rucio_test'] + JobUtils.list_ptest_prod_sources and \
                                                                      self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                                      and file.type == 'output':
                                # extract base LFN since LFN was changed to full LFN for CMS
                                baseLFN = file.lfn.split('/')[-1]
                                guidList.append({'lfn':baseLFN,'guid':file.GUID,'type':file.type,
                                                 'checksum':file.checksum,'md5sum':file.md5sum,
                                                 'fsize':file.fsize,'scope':file.scope})
                        if guidList != []:
                            retG = self.taskBuffer.setGUIDs(guidList)
                        if destDBList != []:
                            # start Closer
                            if adderPlugin != None and hasattr(adderPlugin,'datasetMap') and adderPlugin.datasetMap != {}:
                                cThr = Closer.Closer(self.taskBuffer,destDBList,self.job,datasetMap=adderPlugin.datasetMap)
                            else:
                                cThr = Closer.Closer(self.taskBuffer,destDBList,self.job)
                            self.logger.debug("start Closer")
                            cThr.start()
                            cThr.join()
                            self.logger.debug("end Closer")
                        # run closer for associated parallel jobs
                        if EventServiceUtils.isJobCloningJob(self.job):
                            assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(self.job.jediTaskID,self.job.PandaID,
                                                                                            destDBList)
                            for assJobID,assDBlocks in assDBlockMap.iteritems():
                                assJob = self.taskBuffer.peekJobs([assJobID],fromDefined=False,
                                                                  fromArchived=False,
                                                                  fromWaiting=False,
                                                                  forAnal=True)[0]
                                if assJob == None:
                                    self.logger.debug(': associated job PandaID={0} not found in DB'.format(assJobID))
                                else:
                                    cThr = Closer.Closer(self.taskBuffer,assDBlocks,assJob)
                                    self.logger.debug("start Closer for PandaID={0}".format(assJobID))
                                    cThr.start()
                                    cThr.join()
                                    self.logger.debug("end Closer for PandaID={0}".format(assJobID))
            self.logger.debug("end")
            try:
                # remove Catalog
                os.remove(self.xmlFile)
            except:
                pass
            # unlock XML
            if self.lockXML != None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()            
        except:
            type, value, traceBack = sys.exc_info()
            errStr = ": %s %s " % (type,value)
            errStr += traceback.format_exc()
            self.logger.error(errStr)
            self.logger.error("except")
            # unlock XML just in case
            try:
                if self.lockXML != None:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            except:
                type, value, traceBack = sys.exc_info()
                self.logger.error(": %s %s" % (type,value))
                self.logger.error("cannot unlock XML")


    # parse XML
    # 0: succeeded, 1: harmless error to exit, 2: fatal error, 3: event service
    def parseXML(self):
        # get LFN and GUID
        self.logger.debug('XML filename : %s' % self.xmlFile)
        # no outputs
        if self.job.Files == []:
            self.logger.debug("has no outputs")
            self.logger.debug("parseXML end")
            return 0
        # get input files
        inputLFNs = []
        for file in self.job.Files:
            if file.type == 'input':
                inputLFNs.append(file.lfn)
        # parse XML
        lfns    = []
        guids   = []
        fsizes  = []
        md5sums = []
        chksums = []
        surls   = []
        fullLfnMap = {}
        nEventsMap = {}
        guidMap = dict()
        try:
            root  = xml.dom.minidom.parse(self.xmlFile)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical  = file.getElementsByTagName('logical')[0]
                lfnNode  = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                # get metadata
                fsize   = None
                md5sum  = None
                adler32 = None
                surl    = None
                fullLFN = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'fsize':
                        fsize = long(meta.getAttribute('att_value'))
                    elif name == 'md5sum':
                        md5sum = str(meta.getAttribute('att_value'))
                        # check
                        if re.search("^[a-fA-F0-9]{32}$",md5sum) == None:
                            md5sum = None
                    elif name == 'adler32':
                        adler32 = str(meta.getAttribute('att_value'))
                    elif name == 'surl':
                        surl = str(meta.getAttribute('att_value'))
                    elif name == 'full_lfn':
                        fullLFN = str(meta.getAttribute('att_value'))
                # endpoints
                self.extraInfo['endpoint'][lfn] = []
                for epNode in file.getElementsByTagName('endpoint'):
                    self.extraInfo['endpoint'][lfn].append(str(epNode.firstChild.data))
                # error check
                if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
                    if EventServiceUtils.isEventServiceMerge(self.job):
                        continue
                    else:
                        raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                # append
                lfns.append(lfn)
                guids.append(guid)
                fsizes.append(fsize)
                md5sums.append(md5sum)
                surls.append(surl)
                if adler32 != None:
                    # use adler32 if available
                    chksums.append("ad:%s" % adler32)
                else:
                    chksums.append("md5:%s" % md5sum)
                if fullLFN != None:
                    fullLfnMap[lfn] = fullLFN
        except:
            # parse json
            try:
                import json
                with open(self.xmlFile) as tmpF:
                    jsonDict = json.load(tmpF)
                    for lfn, fileData in jsonDict.iteritems():
                        lfn = str(lfn)
                        fsize   = None
                        md5sum  = None
                        adler32 = None
                        surl    = None
                        fullLFN = None
                        guid = str(fileData['guid'])
                        if 'fsize' in fileData:
                            fsize = long(fileData['fsize'])
                        if 'md5sum' in fileData:
                            md5sum = str(fileData['md5sum'])
                            # check
                            if re.search("^[a-fA-F0-9]{32}$",md5sum) == None:
                                md5sum = None
                        if 'adler32' in fileData:
                            adler32 = str(fileData['adler32'])
                        if 'surl' in fileData:
                            surl = str(fileData['surl'])
                        if 'full_lfn' in fileData:
                            fullLFN = str(fileData['full_lfn'])
                        # endpoints
                        self.extraInfo['endpoint'][lfn] = []
                        if 'endpoint' in fileData:
                            self.extraInfo['endpoint'][lfn] = fileData['endpoint']
                        # error check
                        if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
                            if EventServiceUtils.isEventServiceMerge(self.job):
                                continue
                            else:
                                raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                        # append
                        lfns.append(lfn)
                        guids.append(guid)
                        fsizes.append(fsize)
                        md5sums.append(md5sum)
                        surls.append(surl)
                        if adler32 != None:
                            # use adler32 if available
                            chksums.append("ad:%s" % adler32)
                        else:
                            chksums.append("md5:%s" % md5sum)
                        if fullLFN != None:
                            fullLfnMap[lfn] = fullLFN
            except:
                # check if file exists
                if os.path.exists(self.xmlFile):
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type,value))
                    # set failed anyway
                    self.job.jobStatus = 'failed'
                    # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                    if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                       (self.job.taskBufferErrorCode not in [taskbuffer.ErrorCode.EC_WorkerDone]) and \
                       (self.job.transExitCode  in [0,'0','NULL']):
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                    return 2
                else:
                    # XML was deleted
                    return 1
        # parse metadata to get nEvents
        try:
            root  = xml.dom.minidom.parseString(self.job.metadata)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical  = file.getElementsByTagName('logical')[0]
                lfnNode  = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                guidMap[lfn] = guid
                # get metadata
                nevents = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'events':
                        nevents = long(meta.getAttribute('att_value'))
                        nEventsMap[lfn] = nevents
                        break
        except:
            pass
        # parse json
        try:
            import json
            jsonDict = json.loads(self.job.metadata)
            for jsonFileItem in jsonDict['files']['output']:
                for jsonSubFileItem in jsonFileItem['subFiles']:
                    lfn = str(jsonSubFileItem['name'])
                    try:
                        nevents = long(jsonSubFileItem['nentries'])
                        nEventsMap[lfn] = nevents
                    except:
                        pass
                    try:
                        guid = str(jsonSubFileItem['file_guid'])
                        guidMap[lfn] = guid
                    except:
                        pass
        except:
            pass
        self.logger.debug('nEventsMap=%s' % str(nEventsMap))
        self.logger.debug('guidMap=%s' % str(guidMap))
        # get lumi block number
        lumiBlockNr = self.job.getLumiBlockNr()
        # copy files for variable number of outputs
        tmpStat = self.copyFilesForVariableNumOutputs(lfns)
        if not tmpStat:
            self.logger.error("failed to copy files for variable number of outputs")
            return 2
        # check files
        fileList = []
        for file in self.job.Files:
            fileList.append(file.lfn)
            if file.type == 'input':
                if file.lfn in lfns:
                    if self.job.prodSourceLabel in ['user','panda']:
                        # skipped file
                        file.status = 'skipped'
                    elif self.job.prodSourceLabel in ['managed','test'] + JobUtils.list_ptest_prod_sources:
                        # failed by pilot
                        file.status = 'failed'
            elif file.type == 'output' or file.type == 'log':
                # add only log file for failed jobs
                if self.jobStatus == 'failed' and file.type != 'log':
                    file.status = 'failed'
                    continue
                # set failed if it is missing in XML
                if not file.lfn in lfns:
                    if self.job.jobStatus == 'finished' and \
                            (EventServiceUtils.isEventServiceJob(self.job) or EventServiceUtils.isJumboJob(self.job)):
                        # unset file status for ES jobs
                        pass
                    elif file.isAllowedNoOutput():
                        # allowed not to be produced
                        file.status = 'nooutput'
                        self.logger.debug('set {0} to status={1}'.format(file.lfn,file.status))
                    else:
                        file.status = 'failed'
                        self.job.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(file.lfn)
                        self.logger.error(self.job.ddmErrorDiag)
                    continue
                # look for GUID with LFN
                try:
                    i = lfns.index(file.lfn)
                    file.GUID   = guids[i]
                    file.fsize  = fsizes[i]
                    file.md5sum = md5sums[i]
                    file.checksum = chksums[i]
                    surl = surls[i]
                    # status
                    file.status = 'ready'
                    # change to full LFN
                    if fullLfnMap.has_key(file.lfn):
                        file.lfn = fullLfnMap[file.lfn]
                    # add SURL to extraInfo
                    self.extraInfo['surl'][file.lfn] = surl
                    # add nevents 
                    if nEventsMap.has_key(file.lfn):
                        self.extraInfo['nevents'][file.lfn] = nEventsMap[file.lfn]
                except:
                    # status
                    file.status = 'failed'
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type,value))
                # set lumi block number
                if lumiBlockNr != None and file.status != 'failed':
                    self.extraInfo['lbnr'][file.lfn] = lumiBlockNr 
        self.extraInfo['guid'] = guidMap
        # check consistency between XML and filesTable
        for lfn in lfns:
            if not lfn in fileList:
                self.logger.error("%s is not found in filesTable" % lfn)
                self.job.jobStatus = 'failed'
                for tmpFile in self.job.Files:
                    tmpFile.status = 'failed'
                self.job.ddmErrorCode = ErrorCode.EC_Adder
                self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(lfn)
                return 2
        # return
        self.logger.debug("parseXML end")
        return 0



    # copy files for variable number of outputs
    def copyFilesForVariableNumOutputs(self,lfns):
        # get original output files
        origOutputs = {}
        updateOrig  = {}
        for tmpFile in self.job.Files:
            if tmpFile.type in ['output','log']:
                origOutputs[tmpFile.lfn] = tmpFile
                if tmpFile.lfn in lfns:
                    # keep original
                    updateOrig[tmpFile.lfn] = False
                else:
                    # overwrite original
                    updateOrig[tmpFile.lfn] = True
        # look for unknown files
        addedNewFiles = False
        for newLFN in lfns:
            if not newLFN in origOutputs:
                # look for corresponding original output
                for origLFN in origOutputs.keys():
                    tmpPatt = '^{0}\.*_\d+$'.format(origLFN)
                    if re.search(tmpPatt,newLFN) != None:
                        # copy file record
                        tmpStat = self.taskBuffer.copyFileRecord(newLFN,origOutputs[origLFN],updateOrig[origLFN])
                        if not tmpStat:
                            return False
                        addedNewFiles = True
                        # disable further overwriting
                        updateOrig[origLFN] = False
                        break
        # refresh job info
        if addedNewFiles:
            self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
        # return
        return True
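
parseXML above accepts either a POOL-style XML file catalog or, as a fallback, a JSON report keyed by LFN. A small sketch of the two layouts it expects is given below; the field names are taken from the parsing code, while the LFN, GUID and SURL values are invented for illustration.

# Sketch of the two file-report layouts consumed by parseXML (values invented).
import json
import xml.dom.minidom

xml_report = """<?xml version="1.0"?>
<POOLFILECATALOG>
  <File ID="0a1b2c3d-aaaa-bbbb-cccc-0123456789ab">
    <logical><lfn name="EVNT.01234._000001.pool.root.1"/></logical>
    <metadata att_name="fsize" att_value="123456"/>
    <metadata att_name="adler32" att_value="89abcdef"/>
    <metadata att_name="surl" att_value="srm://example.org/EVNT.01234._000001.pool.root.1"/>
  </File>
</POOLFILECATALOG>
"""

json_report = {
    "EVNT.01234._000001.pool.root.1": {
        "guid": "0a1b2c3d-aaaa-bbbb-cccc-0123456789ab",
        "fsize": 123456,
        "adler32": "89abcdef",
        "surl": "srm://example.org/EVNT.01234._000001.pool.root.1",
        "endpoint": ["EXAMPLE_DATADISK"],
    }
}

if __name__ == "__main__":
    root = xml.dom.minidom.parseString(xml_report)
    for f in root.getElementsByTagName('File'):
        lfn = f.getElementsByTagName('lfn')[0].getAttribute('name')
        print('XML :', f.getAttribute('ID'), lfn)
    for lfn, data in json_report.items():
        print('JSON:', data['guid'], lfn)
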
Example #25
 def run(self):
     try:
         # make a message instance
         tmpLog = LogWrapper(_logger,None)
         # run main procedure in the same process
         if not self.forkRun:
             tmpLog.debug('main start')
             tmpLog.debug('firstSubmission={0}'.format(self.firstSubmission))
             # group jobs per VO
             voJobsMap = {}
             ddmFreeJobs = []
             tmpLog.debug('{0} jobs in total'.format(len(self.jobs)))
             for tmpJob in self.jobs:
                 # set VO=local for DDM free 
                 if tmpJob.destinationSE == 'local':
                     tmpVO = 'local'
                 else:
                     tmpVO = tmpJob.VO
                 # make map
                 if not voJobsMap.has_key(tmpVO):
                     voJobsMap[tmpVO] = []
                 voJobsMap[tmpVO].append(tmpJob)
             # loop over all VOs
             for tmpVO,tmpJobList in voJobsMap.iteritems():
                 tmpLog.debug('vo={0} has {1} jobs'.format(tmpVO,len(tmpJobList)))
                 # get plugin
                 setupperPluginClass = panda_config.getPlugin('setupper_plugins',tmpVO)
                 if setupperPluginClass == None:
                     # use ATLAS plug-in by default
                     from SetupperAtlasPlugin import SetupperAtlasPlugin
                     setupperPluginClass = SetupperAtlasPlugin
                 tmpLog.debug('plugin name -> {0}'.format(setupperPluginClass.__name__))
                 try:
                     # make plugin
                     setupperPlugin = setupperPluginClass(self.taskBuffer,self.jobs,tmpLog,
                                                          resubmit=self.resubmit,
                                                          pandaDDM=self.pandaDDM,
                                                          ddmAttempt=self.ddmAttempt,
                                                          onlyTA=self.onlyTA,
                                                          firstSubmission=self.firstSubmission)
                     # run plugin
                     tmpLog.debug('run plugin')
                     setupperPlugin.run()
                     # go forward if not TA
                     if not self.onlyTA:
                         # update jobs
                         tmpLog.debug('update jobs')
                         self.updateJobs(setupperPlugin.jobs+setupperPlugin.jumboJobs,tmpLog)
                         # execute post process
                         tmpLog.debug('post execute plugin')
                         setupperPlugin.postRun()
                     tmpLog.debug('done plugin')
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     tmpLog.error('plugin failed with {0}:{1}'.format(errtype, errvalue))
             tmpLog.debug('main end')
         else:
             tmpLog.debug('fork start')
             # write jobs to file
             import os
             import commands
             import cPickle as pickle
             outFileName = '%s/set.%s_%s' % (panda_config.logdir,self.jobs[0].PandaID,commands.getoutput('uuidgen'))
             outFile = open(outFileName,'w')
             pickle.dump(self.jobs,outFile)
             outFile.close()
             # run main procedure in another process because python doesn't release memory
             com =  'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd)
             com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \
                    (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python,
                     panda_config.pandaPython_dir,outFileName)
             if self.onlyTA:
                 com += " -t"
             if not self.firstSubmission:
                 com += " -f"
             tmpLog.debug(com)
             # execute
             status,output = self.taskBuffer.processLimiter.getstatusoutput(com)
             tmpLog.debug("return from main process: %s %s" % (status,output))                
             tmpLog.debug('fork end')
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('master failed with {0}:{1}'.format(errtype,errvalue))
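
When forkRun is set, the jobs are pickled to a file under panda_config.logdir and handed to forkSetupper.py in a child process, since CPython rarely returns memory to the OS otherwise. A minimal sketch of that dump/load round trip follows; the JobStandIn class and the temporary file are hypothetical, whereas the real code serialises JobSpec objects.

# Sketch of the pickle hand-off used by the fork path above (stand-in objects).
import os
import pickle
import tempfile

class JobStandIn(object):
    def __init__(self, pandaID):
        self.PandaID = pandaID

jobs = [JobStandIn(4000000001), JobStandIn(4000000002)]

fd, out_file_name = tempfile.mkstemp(prefix='set.')
with os.fdopen(fd, 'wb') as out_file:
    pickle.dump(jobs, out_file)

# the child process (forkSetupper.py -i <file>) would read them back:
with open(out_file_name, 'rb') as in_file:
    restored = pickle.load(in_file)
print([job.PandaID for job in restored])  # [4000000001, 4000000002]
os.remove(out_file_name)
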
Example #26
 def run(self):
     # get logger
     tmpLog = LogWrapper(_logger,'<vuid={0} site={1} name={2}>'.format(self.vuid,
                                                                       self.site,
                                                                       self.dataset))
     # query dataset
     tmpLog.debug("start")
     if self.vuid != None:
         dataset = self.taskBuffer.queryDatasetWithMap({'vuid':self.vuid})
     else:
         dataset = self.taskBuffer.queryDatasetWithMap({'name':self.dataset})
     if dataset == None:
         tmpLog.error("Not found")
         tmpLog.debug("end")
         return
     tmpLog.debug("type:%s name:%s" % (dataset.type,dataset.name))
     if dataset.type == 'dispatch':
         # activate jobs in jobsDefined
         Activator(self.taskBuffer,dataset).start()
     if dataset.type == 'output':
         if dataset.name != None and re.search('^panda\..*_zip$',dataset.name) != None:
             # start unmerge jobs
             Activator(self.taskBuffer,dataset,enforce=True).start()
         else:
             # finish transferring jobs
             Finisher(self.taskBuffer,dataset,site=self.site).start()
     tmpLog.debug("end")
Example #27
def updateJob(req,
              jobId,
              state,
              token=None,
              transExitCode=None,
              pilotErrorCode=None,
              pilotErrorDiag=None,
              timestamp=None,
              timeout=60,
              xml='',
              node=None,
              workdir=None,
              cpuConsumptionTime=None,
              cpuConsumptionUnit=None,
              remainingSpace=None,
              schedulerID=None,
              pilotID=None,
              siteName=None,
              messageLevel=None,
              pilotLog='',
              metaData='',
              cpuConversionFactor=None,
              exeErrorCode=None,
              exeErrorDiag=None,
              pilotTiming=None,
              computingElement=None,
              startTime=None,
              endTime=None,
              nEvents=None,
              nInputFiles=None,
              batchID=None,
              attemptNr=None,
              jobMetrics=None,
              stdout='',
              jobSubStatus=None,
              coreCount=None,
              maxRSS=None,
              maxVMEM=None,
              maxSWAP=None,
              maxPSS=None,
              avgRSS=None,
              avgVMEM=None,
              avgSWAP=None,
              avgPSS=None):
    tmpLog = LogWrapper(
        _logger, 'updateJob PandaID={0} PID={1}'.format(jobId, os.getpid()))
    tmpLog.debug('start')
    # get DN
    realDN = _getDN(req)
    # get FQANs
    fqans = _getFQAN(req)
    # check production role
    prodManager = _checkRole(fqans,
                             realDN,
                             jobDispatcher,
                             site=siteName,
                             hostname=req.get_remote_host())
    # check token
    validToken = _checkToken(token, jobDispatcher)
    # accept json
    acceptJson = req.acceptJson()
    _logger.debug(
        "updateJob(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,attemptNr:%s,jobSubStatus:%s,core:%s,DN:%s,role:%s,token:%s,val:%s,FQAN:%s,maxRSS=%s,maxVMEM=%s,maxSWAP=%s,maxPSS=%s,avgRSS=%s,avgVMEM=%s,avgSWAP=%s,avgPSS=%s\n==XML==\n%s\n==LOG==\n%s\n==Meta==\n%s\n==Metrics==\n%s\n==stdout==\n%s)"
        % (jobId, state, transExitCode, pilotErrorCode, pilotErrorDiag, node,
           workdir, cpuConsumptionTime, cpuConsumptionUnit, remainingSpace,
           schedulerID, pilotID, siteName, messageLevel, nEvents, nInputFiles,
           cpuConversionFactor, exeErrorCode, exeErrorDiag, pilotTiming,
           computingElement, startTime, endTime, batchID, attemptNr,
           jobSubStatus, coreCount, realDN, prodManager, token, validToken,
           str(fqans), maxRSS, maxVMEM, maxSWAP, maxPSS, avgRSS, avgVMEM,
           avgSWAP, avgPSS, xml, pilotLog, metaData, jobMetrics, stdout))
    _pilotReqLogger.info('method=updateJob,site=%s,node=%s,type=None' %
                         (siteName, node))
    # invalid role
    if not prodManager:
        _logger.warning("updateJob(%s) : invalid role" % jobId)
        return Protocol.Response(Protocol.SC_Role).encode(acceptJson)
    # invalid token
    if not validToken:
        _logger.warning("updateJob(%s) : invalid token" % jobId)
        return Protocol.Response(Protocol.SC_Invalid).encode(acceptJson)
    # aborting message
    if jobId == 'NULL':
        return Protocol.Response(Protocol.SC_Success).encode(acceptJson)
    # check status
    if not state in [
            'running', 'failed', 'finished', 'holding', 'starting',
            'transferring'
    ]:
        _logger.warning("invalid state=%s for updateJob" % state)
        return Protocol.Response(Protocol.SC_Success).encode(acceptJson)
    # pilot log
    tmpLog.debug('sending log')
    if pilotLog != '':
        try:
            # make message
            message = pilotLog
            # get logger
            _pandaLogger = PandaLogger()
            _pandaLogger.lock()
            _pandaLogger.setParams({'Type': 'pilotLog', 'PandaID': int(jobId)})
            logger = _pandaLogger.getHttpLogger(panda_config.loggername)
            # add message
            logger.info(message)
        except:
            tmpLog.debug('failed to send log')
        finally:
            tmpLog.debug('release lock')
            try:
                # release HTTP handler
                _pandaLogger.release()
            except:
                pass
    tmpLog.debug('done log')
    # create parameter map
    param = {}
    if cpuConsumptionTime != None:
        param['cpuConsumptionTime'] = cpuConsumptionTime
    if cpuConsumptionUnit != None:
        param['cpuConsumptionUnit'] = cpuConsumptionUnit
    if node != None:
        param['modificationHost'] = node[:128]
    if transExitCode != None:
        param['transExitCode'] = transExitCode
    if pilotErrorCode != None:
        param['pilotErrorCode'] = pilotErrorCode
    if pilotErrorDiag != None:
        param['pilotErrorDiag'] = pilotErrorDiag[:500]
    if jobMetrics != None:
        param['jobMetrics'] = jobMetrics[:500]
    if schedulerID != None:
        param['schedulerID'] = schedulerID
    if pilotID != None:
        param['pilotID'] = pilotID[:200]
    if batchID != None:
        param['batchID'] = batchID[:80]
    if exeErrorCode != None:
        param['exeErrorCode'] = exeErrorCode
    if exeErrorDiag != None:
        param['exeErrorDiag'] = exeErrorDiag[:500]
    if cpuConversionFactor != None:
        param['cpuConversion'] = cpuConversionFactor
    if pilotTiming != None:
        param['pilotTiming'] = pilotTiming
    if computingElement != None:
        param['computingElement'] = computingElement
    if nEvents != None:
        param['nEvents'] = nEvents
    if nInputFiles != None:
        param['nInputFiles'] = nInputFiles
    if not jobSubStatus in [None, '']:
        param['jobSubStatus'] = jobSubStatus
    if not coreCount in [None, '']:
        param['actualCoreCount'] = coreCount
    if maxRSS != None:
        param['maxRSS'] = maxRSS
    if maxVMEM != None:
        param['maxVMEM'] = maxVMEM
    if maxSWAP != None:
        param['maxSWAP'] = maxSWAP
    if maxPSS != None:
        param['maxPSS'] = maxPSS
    if avgRSS != None:
        param['avgRSS'] = avgRSS
    if avgVMEM != None:
        param['avgVMEM'] = avgVMEM
    if avgSWAP != None:
        param['avgSWAP'] = avgSWAP
    if avgPSS != None:
        param['avgPSS'] = avgPSS
    if startTime != None:
        try:
            param['startTime'] = datetime.datetime(
                *time.strptime(startTime, '%Y-%m-%d %H:%M:%S')[:6])
        except:
            pass
    if endTime != None:
        try:
            param['endTime'] = datetime.datetime(
                *time.strptime(endTime, '%Y-%m-%d %H:%M:%S')[:6])
        except:
            pass
    if attemptNr != None:
        try:
            attemptNr = int(attemptNr)
        except:
            attemptNr = None
    if stdout != '':
        stdout = stdout[:2048]
    # invoke JD
    tmpLog.debug('executing')
    return jobDispatcher.updateJob(int(jobId), state, int(timeout), xml,
                                   siteName, param, metaData, attemptNr,
                                   stdout, acceptJson)
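
startTime and endTime arrive as plain strings and are parsed above with the '%Y-%m-%d %H:%M:%S' format, with malformed values silently dropped. A small sketch of that convention follows (hypothetical helper names), showing both the formatting a caller would use and the tolerant parse.

# Sketch of the updateJob timestamp convention ('YYYY-MM-DD HH:MM:SS' strings).
import datetime
import time

def format_job_time(dt):
    return dt.strftime('%Y-%m-%d %H:%M:%S')

def parse_job_time(value):
    # return None for malformed input, mirroring the try/except above
    try:
        return datetime.datetime(*time.strptime(value, '%Y-%m-%d %H:%M:%S')[:6])
    except (TypeError, ValueError):
        return None

if __name__ == "__main__":
    stamp = format_job_time(datetime.datetime(2024, 5, 1, 12, 30, 0))
    print(stamp)                     # 2024-05-01 12:30:00
    print(parse_job_time(stamp))     # 2024-05-01 12:30:00
    print(parse_job_time('broken'))  # None
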
Example #28
class EventPicker:
    # constructor
    def __init__(self, taskBuffer, siteMapper, evpFileName, ignoreError):
        self.taskBuffer = taskBuffer
        self.siteMapper = siteMapper
        self.ignoreError = ignoreError
        self.evpFileName = evpFileName
        self.token = datetime.datetime.utcnow().isoformat(' ')
        # logger
        self.logger = LogWrapper(_logger, self.token)
        self.pd2p = DynDataDistributer.DynDataDistributer([],
                                                          self.taskBuffer,
                                                          self.siteMapper,
                                                          token=' ',
                                                          logger=self.logger)
        self.userDatasetName = ''
        self.creationTime = ''
        self.params = ''
        self.lockedBy = ''
        self.evpFile = None
        self.userTaskName = ''
        # message buffer
        self.msgBuffer = []
        self.lineLimit = 100
        # JEDI
        self.jediTaskID = None

    # main
    def run(self):
        try:
            self.putLog('start %s' % self.evpFileName)
            # lock evp file
            self.evpFile = open(self.evpFileName)
            try:
                fcntl.flock(self.evpFile.fileno(),
                            fcntl.LOCK_EX | fcntl.LOCK_NB)
            except:
                # release
                self.putLog("cannot lock %s" % self.evpFileName)
                self.evpFile.close()
                return True
            # options
            runEvtList = []
            eventPickDataType = ''
            eventPickStreamName = ''
            eventPickDS = []
            eventPickAmiTag = ''
            eventPickNumSites = 1
            inputFileList = []
            tagDsList = []
            tagQuery = ''
            tagStreamRef = ''
            skipDaTRI = False
            runEvtGuidMap = {}
            # read evp file
            for tmpLine in self.evpFile:
                tmpMatch = re.search('^([^=]+)=(.+)$', tmpLine)
                # check format
                if tmpMatch == None:
                    continue
                tmpItems = tmpMatch.groups()
                if tmpItems[0] == 'runEvent':
                    # get run and event number
                    tmpRunEvt = tmpItems[1].split(',')
                    if len(tmpRunEvt) == 2:
                        runEvtList.append(tmpRunEvt)
                elif tmpItems[0] == 'eventPickDataType':
                    # data type
                    eventPickDataType = tmpItems[1]
                elif tmpItems[0] == 'eventPickStreamName':
                    # stream name
                    eventPickStreamName = tmpItems[1]
                elif tmpItems[0] == 'eventPickDS':
                    # dataset pattern
                    eventPickDS = tmpItems[1].split(',')
                elif tmpItems[0] == 'eventPickAmiTag':
                    # AMI tag
                    eventPickAmiTag = tmpItems[1]
                elif tmpItems[0] == 'eventPickNumSites':
                    # the number of sites where datasets are distributed
                    try:
                        eventPickNumSites = int(tmpItems[1])
                    except:
                        pass
                elif tmpItems[0] == 'userName':
                    # user name
                    self.userDN = tmpItems[1]
                    self.putLog("user=%s" % self.userDN)
                elif tmpItems[0] == 'userTaskName':
                    # user task name
                    self.userTaskName = tmpItems[1]
                elif tmpItems[0] == 'userDatasetName':
                    # user dataset name
                    self.userDatasetName = tmpItems[1]
                elif tmpItems[0] == 'lockedBy':
                    # client name
                    self.lockedBy = tmpItems[1]
                elif tmpItems[0] == 'creationTime':
                    # creation time
                    self.creationTime = tmpItems[1]
                elif tmpItems[0] == 'params':
                    # parameters
                    self.params = tmpItems[1]
                elif tmpItems[0] == 'inputFileList':
                    # input file list
                    inputFileList = tmpItems[1].split(',')
                    try:
                        inputFileList.remove('')
                    except:
                        pass
                elif tmpItems[0] == 'tagDS':
                    # TAG dataset
                    tagDsList = tmpItems[1].split(',')
                elif tmpItems[0] == 'tagQuery':
                    # query for TAG
                    tagQuery = tmpItems[1]
                elif tmpItems[0] == 'tagStreamRef':
                    # StreamRef for TAG
                    tagStreamRef = tmpItems[1]
                    if not tagStreamRef.endswith('_ref'):
                        tagStreamRef += '_ref'
                elif tmpItems[0] == 'runEvtGuidMap':
                    # GUIDs
                    try:
                        exec "runEvtGuidMap=" + tmpItems[1]
                    except:
                        pass
            # extract task name
            if self.userTaskName == '' and self.params != '':
                try:
                    tmpMatch = re.search('--outDS(=| ) *([^ ]+)', self.params)
                    if tmpMatch != None:
                        self.userTaskName = tmpMatch.group(2)
                        if not self.userTaskName.endswith('/'):
                            self.userTaskName += '/'
                except:
                    pass
            # suppress DaTRI
            if self.params != '':
                if '--eventPickSkipDaTRI' in self.params:
                    skipDaTRI = True
            # get compact user name
            compactDN = self.taskBuffer.cleanUserID(self.userDN)
            # get jediTaskID
            self.jediTaskID = self.taskBuffer.getTaskIDwithTaskNameJEDI(
                compactDN, self.userTaskName)
            # convert
            if tagDsList == [] or tagQuery == '':
                # convert run/event list to dataset/file list
                tmpRet, locationMap, allFiles = self.pd2p.convertEvtRunToDatasets(
                    runEvtList, eventPickDataType, eventPickStreamName,
                    eventPickDS, eventPickAmiTag, self.userDN, runEvtGuidMap)
                if not tmpRet:
                    if 'isFatal' in locationMap and locationMap[
                            'isFatal'] == True:
                        self.ignoreError = False
                    self.endWithError(
                        'Failed to convert the run/event list to a dataset/file list'
                    )
                    return False
            else:
                # get parent dataset/files with TAG
                tmpRet, locationMap, allFiles = self.pd2p.getTagParentInfoUsingTagQuery(
                    tagDsList, tagQuery, tagStreamRef)
                if not tmpRet:
                    self.endWithError(
                        'Failed to get parent dataset/file list with TAG')
                    return False
            # use only files in the list
            if inputFileList != []:
                tmpAllFiles = []
                for tmpFile in allFiles:
                    if tmpFile['lfn'] in inputFileList:
                        tmpAllFiles.append(tmpFile)
                allFiles = tmpAllFiles
            # remove redundant CN from DN
            tmpDN = self.userDN
            tmpDN = re.sub('/CN=limited proxy', '', tmpDN)
            tmpDN = re.sub('(/CN=proxy)+$', '', tmpDN)
            # make dataset container
            tmpRet = self.pd2p.registerDatasetContainerWithDatasets(
                self.userDatasetName,
                allFiles,
                locationMap,
                nSites=eventPickNumSites,
                owner=tmpDN)
            if not tmpRet:
                self.endWithError('Failed to make a dataset container %s' %
                                  self.userDatasetName)
                return False
            # skip DaTRI
            if skipDaTRI:
                # successfully terminated
                self.putLog("skip DaTRI")
                # update task
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID)
            else:
                # get candidates
                tmpRet, candidateMaps = self.pd2p.getCandidates(
                    self.userDatasetName, checkUsedFile=False, useHidden=True)
                if not tmpRet:
                    self.endWithError(
                        'Failed to find candidate for destination')
                    return False
                # collect all candidates
                allCandidates = []
                for tmpDS, tmpDsVal in candidateMaps.iteritems():
                    for tmpCloud, tmpCloudVal in tmpDsVal.iteritems():
                        for tmpSiteName in tmpCloudVal[0]:
                            if not tmpSiteName in allCandidates:
                                allCandidates.append(tmpSiteName)
                if allCandidates == []:
                    self.endWithError('No candidate for destination')
                    return False
                # get list of dataset (container) names
                if eventPickNumSites > 1:
                    # decompose container to transfer datasets separately
                    tmpRet, tmpOut = self.pd2p.getListDatasetReplicasInContainer(
                        self.userDatasetName)
                    if not tmpRet:
                        self.endWithError('Failed to get the size of %s' %
                                          self.userDatasetName)
                        return False
                    userDatasetNameList = tmpOut.keys()
                else:
                    # transfer container at once
                    userDatasetNameList = [self.userDatasetName]
                # loop over all datasets
                sitesUsed = []
                for tmpUserDatasetName in userDatasetNameList:
                    # get size of dataset container
                    tmpRet, totalInputSize = rucioAPI.getDatasetSize(
                        tmpUserDatasetName)
                    if not tmpRet:
                        self.endWithError('Failed to get the size of %s' %
                                          tmpUserDatasetName)
                        return False
                    # run brokerage
                    tmpJob = JobSpec()
                    tmpJob.AtlasRelease = ''
                    self.putLog("run brokerage for %s" % tmpDS)
                    brokerage.broker.schedule([tmpJob],
                                              self.taskBuffer,
                                              self.siteMapper,
                                              True,
                                              allCandidates,
                                              True,
                                              datasetSize=totalInputSize)
                    if tmpJob.computingSite.startswith('ERROR'):
                        self.endWithError('brokerage failed with %s' %
                                          tmpJob.computingSite)
                        return False
                    self.putLog("site -> %s" % tmpJob.computingSite)
                    # send transfer request
                    try:
                        tmpDN = rucioAPI.parse_dn(tmpDN)
                        tmpStatus, userInfo = rucioAPI.finger(tmpDN)
                        if not tmpStatus:
                            raise RuntimeError, 'user info not found for {0} with {1}'.format(
                                tmpDN, userInfo)
                        tmpDN = userInfo['nickname']
                        tmpDQ2ID = self.siteMapper.getSite(
                            tmpJob.computingSite).ddm
                        tmpMsg = "%s ds=%s site=%s id=%s" % (
                            'registerDatasetLocation for DaTRI ',
                            tmpUserDatasetName, tmpDQ2ID, tmpDN)
                        self.putLog(tmpMsg)
                        rucioAPI.registerDatasetLocation(
                            tmpUserDatasetName, [tmpDQ2ID],
                            lifetime=14,
                            owner=tmpDN,
                            activity="User Subscriptions")
                        self.putLog('OK')
                    except:
                        errType, errValue = sys.exc_info()[:2]
                        tmpStr = 'Failed to send transfer request : %s %s' % (
                            errType, errValue)
                        tmpStr = tmpStr.strip()
                        tmpStr += traceback.format_exc()
                        self.endWithError(tmpStr)
                        return False
                    # list of sites already used
                    sitesUsed.append(tmpJob.computingSite)
                    self.putLog("used %s sites" % len(sitesUsed))
                    # set candidates
                    if len(sitesUsed) >= eventPickNumSites:
                        # reset candidates to limit the number of sites
                        allCandidates = sitesUsed
                        sitesUsed = []
                    else:
                        # remove site
                        allCandidates.remove(tmpJob.computingSite)
                # send email notification for success
                tmpMsg = 'A transfer request was successfully sent to Rucio.\n'
                tmpMsg += 'Your task will get started once transfer is completed.'
                self.sendEmail(True, tmpMsg)
            try:
                # unlock and delete evp file
                fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_UN)
                self.evpFile.close()
                os.remove(self.evpFileName)
            except:
                pass
            # successfully terminated
            self.putLog("end %s" % self.evpFileName)
            return True
        except:
            errType, errValue = sys.exc_info()[:2]
            self.endWithError('Got exception %s:%s %s' %
                              (errType, errValue, traceback.format_exc()))
            return False

    # end with error
    def endWithError(self, message):
        self.putLog(message, 'error')
        # unlock evp file
        try:
            fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_UN)
            self.evpFile.close()
            if not self.ignoreError:
                # remove evp file
                os.remove(self.evpFileName)
                # send email notification
                self.sendEmail(False, message)
        except:
            pass
        # upload log
        if self.jediTaskID != None:
            outLog = self.uploadLog()
            self.taskBuffer.updateTaskErrorDialogJEDI(
                self.jediTaskID, 'event picking failed. ' + outLog)
            # update task
            if not self.ignoreError:
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID,
                                                      'tobroken')
            self.putLog(outLog)
        self.putLog('end %s' % self.evpFileName)

    # put log
    def putLog(self, msg, type='debug'):
        tmpMsg = msg
        if type == 'error':
            self.logger.error(tmpMsg)
        else:
            self.logger.debug(tmpMsg)

    # send email notification
    def sendEmail(self, isSucceeded, message):
        # mail address
        toAdder = Notifier(self.taskBuffer, None, []).getEmail(self.userDN)
        if toAdder == '':
            self.putLog('cannot find email address for %s' % self.userDN,
                        'error')
            return
        # subject
        mailSubject = "PANDA notification for Event-Picking Request"
        # message
        mailBody = "Hello,\n\nHere is your request status for event picking\n\n"
        if isSucceeded:
            mailBody += "Status  : Passed to Rucio\n"
        else:
            mailBody += "Status  : Failed\n"
        mailBody += "Created : %s\n" % self.creationTime
        mailBody += "Ended   : %s\n" % datetime.datetime.utcnow().strftime(
            '%Y-%m-%d %H:%M:%S')
        mailBody += "Dataset : %s\n" % self.userDatasetName
        mailBody += "\n"
        mailBody += "Parameters : %s %s\n" % (self.lockedBy, self.params)
        mailBody += "\n"
        mailBody += "%s\n" % message
        # send
        retVal = MailUtils().send(toAdder, mailSubject, mailBody)
        # return
        return

    # upload log
    def uploadLog(self):
        if self.jediTaskID == None:
            return 'cannot find jediTaskID'
        strMsg = self.logger.dumpToString()
        s, o = Client.uploadLog(strMsg, self.jediTaskID)
        if s != 0:
            return "failed to upload log with {0}.".format(s)
        if o.startswith('http'):
            return '<a href="{0}">log</a>'.format(o)
        return o
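run() relies on a non-blocking flock so that only one EventPicker instance processes a given evp file at a time. Below is a minimal, self-contained sketch of that lock-or-skip pattern; the file name is a placeholder and the loop body stands in for the real key=value parsing.

import fcntl

evp_file = open('some_request.evp')  # placeholder path, not a real evp file
try:
    # fail immediately instead of blocking if another process holds the lock
    fcntl.flock(evp_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
except IOError:
    # another worker owns this file, so skip it
    evp_file.close()
else:
    try:
        for line in evp_file:
            pass  # real code would parse key=value options here
    finally:
        fcntl.flock(evp_file.fileno(), fcntl.LOCK_UN)
        evp_file.close()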
Example #29
0
def uploadLog(req,file):
    if not Protocol.isSecure(req):
        return False
    if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']:
        return False
    tmpLog = LogWrapper(_logger,'uploadLog <{0}>'.format(file.filename))
    tmpLog.debug("start {0}".format(req.subprocess_env['SSL_CLIENT_S_DN']))
    # size check
    sizeLimit = 100*1024*1024
    # get file size
    contentLength = 0
    try:
        contentLength = long(req.headers_in["content-length"])
    except:
        if req.headers_in.has_key("content-length"):
            tmpLog.error("cannot get CL : %s" % req.headers_in["content-length"])
        else:
            tmpLog.error("no CL")
    tmpLog.debug("size %s" % contentLength)
    if contentLength > sizeLimit:
        errStr = "failed to upload log due to size limit"
        tmpLog.error(errStr)
        tmpLog.debug("end")            
        return errStr
    jediLogDir = '/jedilog'
    retStr = ''
    try:
        fileBaseName = file.filename.split('/')[-1]
        fileFullPath = '{0}{1}/{2}'.format(panda_config.cache_dir,jediLogDir,fileBaseName)
        # delete old file 
        if os.path.exists(fileFullPath):
            os.remove(fileFullPath)
        # write
        fo = open(fileFullPath,'wb')
        fileContent = file.file.read()
        fo.write(fileContent)
        fo.close()
        tmpLog.debug("written to {0}".format(fileFullPath))
        retStr = 'http://{0}/cache{1}/{2}'.format(getServerHTTP(None),jediLogDir,fileBaseName) 
    except:
        errtype,errvalue = sys.exc_info()[:2]
        errStr = "failed to write log with {0}:{1}".format(errtype.__name__,errvalue)
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    tmpLog.debug("end")
    return retStr
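The Content-Length guard above is what keeps oversized uploads out of the cache area. For illustration only, the same check can be factored into a small helper; this sketch mirrors the snippet but is not a function that exists in the server.

SIZE_LIMIT = 100 * 1024 * 1024  # same 100 MB cap as above

def within_size_limit(headers_in, logger):
    # headers_in is assumed to be a dict-like mapping of request headers
    try:
        content_length = int(headers_in["content-length"])
    except Exception:
        logger.error("no or unparsable content-length")
        content_length = 0
    logger.debug("size %s" % content_length)
    return content_length <= SIZE_LIMIT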
Example #30
0
def getFilesFromLRC(files,
                    url,
                    guids=[],
                    storageName=[],
                    terminateWhenFailed=False,
                    getPFN=False,
                    scopeList=[]):
    tmpLog = LogWrapper(_log, None)
    tmpLog.debug('getFilesFromLRC "%s" %s' % (url, str(storageName)))
    # get PFC
    outSTR = ''
    if url.startswith('mysql://'):
        # from MySQL
        outSTR = _getPFNFromMySQL(files, url)
        # get PFN
        if getPFN:
            outPFN = {}
            # FIXME
            tmpLog.debug('RetPFN:%s ' % str(outPFN))
            return outPFN
    elif url.startswith('http://'):
        # from HTTP I/F
        outSTR = _getPoolFileCatalog(files, url)
        # get PFN
        if getPFN:
            outPFN = {}
            try:
                if not outSTR in ['', None]:
                    root = xml.dom.minidom.parseString(outSTR)
                    fileNodes = root.getElementsByTagName('File')
                    for file in fileNodes:
                        # get PFN and LFN nodes
                        physical = file.getElementsByTagName('physical')[0]
                        pfnNode = physical.getElementsByTagName('pfn')[0]
                        logical = file.getElementsByTagName('logical')[0]
                        lfnNode = logical.getElementsByTagName('lfn')[0]
                        # convert UTF8 to Raw
                        pfn = str(pfnNode.getAttribute('name'))
                        lfn = str(lfnNode.getAttribute('name'))
                        # assign
                        if not outPFN.has_key(lfn):
                            outPFN[lfn] = []
                        outPFN[lfn].append(pfn)
            except:
                type, value, traceBack = sys.exc_info()
                tmpLog.error(outSTR)
                tmpLog.error("could not parse XML - %s %s" % (type, value))
            tmpLog.debug('RetPFN:%s ' % str(outPFN))
            return outPFN
    elif url.startswith('lfc://') or url.startswith('rucio://'):
        # from LFC
        timeStart = datetime.datetime.utcnow()
        outSTR = _getPFNFromLFC(files,
                                url,
                                guids,
                                storageName,
                                scopeList=scopeList,
                                tmpLog=tmpLog)
        regTime = datetime.datetime.utcnow() - timeStart
        tmpLog.debug(
            'file lookup for %s LFNs from %s took %s.%03d sec' %
            (len(files), url, regTime.seconds, regTime.microseconds / 1000))
        # get PFN
        if getPFN:
            outPFN = {}
            try:
                if not outSTR in ['', None]:
                    tmpItems = outSTR.split('LFCRet :')
                    tmpItems.remove('')
                    # loop over all returns
                    for tmpItem in tmpItems:
                        exec "tmpLFNmap = %s" % tmpItem
                        for tmpLFN, tmpPFN in tmpLFNmap.iteritems():
                            outPFN[tmpLFN] = tmpPFN
            except:
                type, value, traceBack = sys.exc_info()
                tmpLog.error(outSTR)
                tmpLog.error("could not parse LFC ret - %s %s" % (type, value))
            tmpLog.debug('RetPFN:%s files' % len(outPFN))
            return outPFN
    # check return
    if not isinstance(outSTR, types.StringType):
        if terminateWhenFailed:
            return None
        # set empty string
        outSTR = ''
    # collect OK Files
    okFiles = []
    for file in files:
        if re.search(file, outSTR) != None:
            okFiles.append(file)
    tmpLog.debug('Ret:%s / %s files' % (str(okFiles[:3]), len(okFiles)))
    return okFiles
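The http:// branch above walks a POOL-file-catalogue-style XML document with xml.dom.minidom. Here is a self-contained toy version of that parsing step; the catalogue snippet is invented but follows the same File/physical/pfn/logical/lfn layout the code expects.

import xml.dom.minidom

sample = """
<POOLFILECATALOG>
  <File ID="guid-1234">
    <physical><pfn name="srm://somesite/path/file1.root"/></physical>
    <logical><lfn name="file1.root"/></logical>
  </File>
</POOLFILECATALOG>
"""

out_pfn = {}
root = xml.dom.minidom.parseString(sample)
for file_node in root.getElementsByTagName('File'):
    # getElementsByTagName searches recursively, so pfn/lfn are found under
    # the physical/logical wrappers as in the snippet above
    pfn = str(file_node.getElementsByTagName('pfn')[0].getAttribute('name'))
    lfn = str(file_node.getElementsByTagName('lfn')[0].getAttribute('name'))
    out_pfn.setdefault(lfn, []).append(pfn)
print(out_pfn)  # {'file1.root': ['srm://somesite/path/file1.root']}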
Example #31
0
def uploadLog(req, file):
    if not Protocol.isSecure(req):
        return False
    if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']:
        return False
    tmpLog = LogWrapper(_logger, 'uploadLog <{0}>'.format(file.filename))
    tmpLog.debug("start {0}".format(req.subprocess_env['SSL_CLIENT_S_DN']))
    # size check
    sizeLimit = 100 * 1024 * 1024
    # get file size
    contentLength = 0
    try:
        contentLength = long(req.headers_in["content-length"])
    except:
        if req.headers_in.has_key("content-length"):
            tmpLog.error("cannot get CL : %s" %
                         req.headers_in["content-length"])
        else:
            tmpLog.error("no CL")
    tmpLog.debug("size %s" % contentLength)
    if contentLength > sizeLimit:
        errStr = "failed to upload log due to size limit"
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    jediLogDir = '/jedilog'
    retStr = ''
    try:
        fileBaseName = file.filename.split('/')[-1]
        fileFullPath = '{0}{1}/{2}'.format(panda_config.cache_dir, jediLogDir,
                                           fileBaseName)
        # delete old file
        if os.path.exists(fileFullPath):
            os.remove(fileFullPath)
        # write
        fo = open(fileFullPath, 'wb')
        fileContent = file.file.read()
        fo.write(fileContent)
        fo.close()
        tmpLog.debug("written to {0}".format(fileFullPath))
        retStr = 'http://{0}/cache{1}/{2}'.format(getServerHTTP(None),
                                                  jediLogDir, fileBaseName)
    except:
        errtype, errvalue = sys.exc_info()[:2]
        errStr = "failed to write log with {0}:{1}".format(
            errtype.__name__, errvalue)
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    tmpLog.debug("end")
    return retStr
Example #32
0
 def getGUIDsFromEventIndex(self,runEventList,streamName,amiTags,dataType):
     comment = ' /* DBProxy.getGUIDsFromEventIndex */'
     methodName = comment.split(' ')[-2].split('.')[-1]
     tmpLog = LogWrapper(_logger,methodName+" <streamName={0} amiTags={1} dataType={2}>".format(streamName,amiTags,dataType))
     try:
         # change to list
         if not amiTags in [None,'']:
             amiTags = amiTags.replace('*','.*').split(',')
         tmpLog.debug("start for {0} events".format(len(runEventList)))
         # check data type
         if not dataType in ['RAW','ESD','AOD']:
             return False,'dataType={0} is unsupported'.format(dataType)
         # sql to insert runs and events
         sqlRE  = "INSERT INTO {0}.TMP_RUN_EVENT_PAIRS (runNumber,eventNumber) ".format(panda_config.schemaEI)
         sqlRE += "VALUES (:runNumber,:eventNumber) "
         varMaps = []
         for runNumber,eventNumber in runEventList:
             varMap = {}
             varMap[':runNumber'] = runNumber
             varMap[':eventNumber'] = eventNumber
             varMaps.append(varMap)
         # begin transaction
         self.conn.begin()
         self.cur.arraysize = 100000
         # insert runs and events
         self.cur.executemany(sqlRE+comment, varMaps)
         # read GUIDs
         varMap = {}
         if amiTags in [None,'']:
             sqlRG  = "SELECT runNumber,eventNumber,guid_{0} ".format(dataType)
             sqlRG += "FROM {0}.V_PANDA_EVPICK_NOAMITAG_MANY ".format(panda_config.schemaEI)
         else:
             sqlRG  = "SELECT runNumber,eventNumber,guid_{0},amiTag ".format(dataType)
             sqlRG += "FROM {0}.V_PANDA_EVPICK_AMITAG_MANY ".format(panda_config.schemaEI)
         if not streamName in [None,'']:
             sqlRG += "WHERE streamName=:streamName "
             varMap[':streamName'] = streamName
         self.cur.execute(sqlRG+comment, varMap)
         resRG = self.cur.fetchall()
         # commit
         if not self._commit():
             raise RuntimeError, 'Commit error'
         retValue = {}
         keyAmiIdxMap = {}
         for tmpItem in resRG:
             if amiTags in [None,'']:
                 runNumber,eventNumber,guid = tmpItem
                 # dummy
                 idxTag = 0
             else:
                 runNumber,eventNumber,guid,amiTag = tmpItem
                 # get index number for the AMI tag in the list
                 idxTag = self.getIndexAmiTag(amiTags,amiTag)
                 # didn't match
                 if idxTag == None:
                     continue
             tmpKey = (runNumber,eventNumber)
             # use AMI tag in preference order
             if tmpKey in keyAmiIdxMap and keyAmiIdxMap[tmpKey] < idxTag:
                 continue
             keyAmiIdxMap[tmpKey] = idxTag
             retValue[tmpKey] = [guid]
         tmpLog.debug("found {0} events".format(len(retValue)))
         return True,retValue
     except:
         # roll back
         self._rollback()
         # error
         self.dumpErrorMessage(_logger,methodName)
         return False,None
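getGUIDsFromEventIndex stages the run/event pairs with one executemany call using named bind variables, then joins them against the EventIndex views. The bulk-insert half of that pattern is database-agnostic; below is a minimal sketch against sqlite3, whereas the real method goes through an Oracle cursor and PanDA/EventIndex-specific tables and views.

import sqlite3

conn = sqlite3.connect(':memory:')
cur = conn.cursor()
cur.execute("CREATE TABLE tmp_run_event_pairs (runNumber INTEGER, eventNumber INTEGER)")

run_event_list = [(358031, 1001), (358031, 1002), (364292, 17)]
var_maps = [{'runNumber': r, 'eventNumber': e} for r, e in run_event_list]

# one call for the whole list, like self.cur.executemany(sqlRE + comment, varMaps)
cur.executemany(
    "INSERT INTO tmp_run_event_pairs (runNumber, eventNumber) "
    "VALUES (:runNumber, :eventNumber)",
    var_maps)
conn.commit()
print(cur.execute("SELECT COUNT(*) FROM tmp_run_event_pairs").fetchone()[0])  # -> 3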
Example #33
0
import re
from config import panda_config

from taskbuffer.TaskBuffer import taskBuffer

from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper

import panda_proxy_cache

# logger
_logger = PandaLogger().getLogger('panda_activeusers_query')
tmpLog = LogWrapper(_logger)

if __name__ == '__main__':

    tmpLog.debug("================= start ==================")
    # instantiate TB
    taskBuffer.init(panda_config.dbhost,
                    panda_config.dbpasswd,
                    nDBConnection=1)

    # instantiate MyProxy I/F
    my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface()

    # roles
    if hasattr(panda_config, 'proxy_cache_roles'):
        roles = panda_config.proxy_cache_roles.split(',')
    else:
        roles = [
            'atlas', 'atlas:/atlas/Role=production', 'atlas:/atlas/Role=pilot'
Example #34
0
import sys
import datetime
import traceback
from taskbuffer.TaskBuffer import taskBuffer
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper
from brokerage.SiteMapper import SiteMapper


# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('prioryMassage')
tmpLog = LogWrapper(_logger)


tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# get usage breakdown
usageBreakDownPerUser = {}
usageBreakDownPerSite = {}
workingGroupList = []
for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']:
Example #35
0
import sys
import datetime
import multiprocessing
from taskbuffer.TaskBuffer import taskBuffer
import pandalogger.PandaLogger
from pandalogger.PandaLogger import PandaLogger
from brokerage.SiteMapper import SiteMapper
from pandautils import PandaUtils
from pandalogger.LogWrapper import LogWrapper

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('add')

tmpLog = LogWrapper(_logger, None)

tmpLog.debug("===================== start =====================")

# overall timeout value
overallTimeout = 20

# grace period
try:
    gracePeriod = int(sys.argv[1])
except:
    gracePeriod = 3

# current minute
currentMinute = datetime.datetime.utcnow().minute
Example #36
0
import sys
import datetime
import traceback
from taskbuffer.TaskBuffer import taskBuffer
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper
from brokerage.SiteMapper import SiteMapper
from taskbuffer import ErrorCode

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('esPreemption')
tmpLog = LogWrapper(_logger)


tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# time limit
timeLimit = datetime.datetime.utcnow()-datetime.timedelta(minutes=15)

# get low priority ES jobs per site
sqlEsJobs  = "SELECT PandaID,computingSite,commandToPilot,startTime "