def main(backGround=False):
    _logger.debug('starting ...')
    # register signal handler
    signal.signal(signal.SIGINT, catch_sig)
    signal.signal(signal.SIGHUP, catch_sig)
    signal.signal(signal.SIGTERM, catch_sig)
    signal.signal(signal.SIGALRM, catch_sig)
    signal.alarm(overallTimeout)
    # forking
    pid = os.fork()
    if pid != 0:
        # watch child process
        os.wait()
        time.sleep(1)
    else:
        # main loop
        from taskbuffer.TaskBuffer import taskBuffer
        # initialize cx_Oracle using dummy connection
        from taskbuffer.Initializer import initializer
        initializer.init()
        # instantiate TB
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
        # instantiate sitemapper
        siteMapper = SiteMapper(taskBuffer)
        # ActiveMQ params
        clientid = 'PANDA-' + socket.getfqdn()
        queue = '/queue/Consumer.test1.poc.pocMSG'
        ssl_opts = {
            'use_ssl': True,
            'ssl_cert_file': '%s/hostcert.pem' % panda_config.certdir,
            'ssl_key_file': '%s/hostkey.pem' % panda_config.certdir
        }
        # resolve multiple brokers
        brokerList = socket.gethostbyname_ex('gridmsg007.cern.ch')[-1]
        # set listener
        for tmpBroker in brokerList:
            try:
                _logger.debug('setting listener on %s' % tmpBroker)
                conn = stomp.Connection(host_and_ports=[(tmpBroker, 6162)],
                                        **ssl_opts)
                conn.set_listener(
                    'GenCallbackConsumer',
                    GenCallbackConsumer(conn, taskBuffer, siteMapper))
                conn.start()
                conn.connect(headers={'client-id': clientid})
                conn.subscribe(destination=queue, ack='client-individual')
                #,headers = {'selector':"cbtype='FileDoneMessage'"})
                if not conn.is_connected():
                    _logger.error("connection failure to %s" % tmpBroker)
            except:
                errtype, errvalue = sys.exc_info()[:2]
                _logger.error("failed to set listener on %s : %s %s" %
                              (tmpBroker, errtype, errvalue))
                catch_sig(None, None)
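# Note: GenCallbackConsumer is not shown on this page. A minimal sketch of
# what such a stomp.py listener could look like (the message handling below
# is illustrative, and the ack signature differs across stomp.py versions):
import stomp

class EchoConsumer(stomp.ConnectionListener):
    """Minimal listener: log each message frame and ack it explicitly."""
    def __init__(self, conn):
        self.conn = conn

    def on_message(self, headers, body):
        _logger.debug('received: %s' % body)
        # ack='client-individual' requires an explicit per-message ack
        self.conn.ack(headers['message-id'], headers['subscription'])

    def on_error(self, headers, body):
        _logger.error('broker error: %s' % body)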
Example #2
def main(backGround=False):
    _logger.debug('starting ...')
    # register signal handler
    signal.signal(signal.SIGINT, catch_sig)
    signal.signal(signal.SIGHUP, catch_sig)
    signal.signal(signal.SIGTERM, catch_sig)
    signal.signal(signal.SIGALRM, catch_sig)
    signal.alarm(overallTimeout)
    # forking
    pid = os.fork()
    if pid != 0:
        # watch child process
        os.wait()
        time.sleep(1)
    else:
        # main loop
        from taskbuffer.TaskBuffer import taskBuffer
        # check certificate
        certName = '%s/pandasv1_usercert.pem' % panda_config.certdir
        keyName = '%s/pandasv1_userkey.pem' % panda_config.certdir

        _logger.debug('checking certificate {0}'.format(certName))
        certOK, certMsg = DataServiceUtils.checkCertificate(certName)
        if not certOK:
            _logger.error('bad certificate : {0}'.format(certMsg))
        # initialize cx_Oracle using dummy connection
        from taskbuffer.Initializer import initializer
        initializer.init()
        # instantiate TB
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
        # instantiate sitemapper
        siteMapper = SiteMapper(taskBuffer)
        # ActiveMQ params
        queue = '/queue/Consumer.PANDA.atlas.ddm.siteservices'
        ssl_opts = {
            'use_ssl': True,
            'ssl_version': ssl.PROTOCOL_TLSv1,
            'ssl_cert_file': certName,
            'ssl_key_file': keyName
        }
        # resolve multiple brokers
        brokerList = socket.gethostbyname_ex('atlas-mb.cern.ch')[-1]
        # set listener
        connList = []
        for tmpBroker in brokerList:
            try:
                clientid = 'PANDA-' + socket.getfqdn() + '-' + tmpBroker
                subscription_id = 'panda-server-consumer-' + socket.getfqdn()
                _logger.debug('setting listener %s' % clientid)
                conn = stomp.Connection(host_and_ports=[(tmpBroker, 61023)],
                                        **ssl_opts)
                # keep the per-broker ids together with the connection,
                # so the reconnect loop below uses the right ones
                connList.append((conn, tmpBroker, clientid, subscription_id))
            except:
                errtype, errvalue = sys.exc_info()[:2]
                _logger.error("failed to connect to %s : %s %s" %
                              (tmpBroker, errtype, errvalue))
                catch_sig(None, None)
        while True:
            for conn, tmpBroker, clientid, subscription_id in connList:
                try:
                    if not conn.is_connected():
                        conn.set_listener(
                            'FileCallbackListener',
                            FileCallbackListener(conn, taskBuffer, siteMapper,
                                                 subscription_id))
                        conn.start()
                        conn.connect(headers={'client-id': clientid})
                        conn.subscribe(destination=queue,
                                       id=subscription_id,
                                       ack='client-individual')
                        _logger.debug('listener %s is up and running' %
                                      clientid)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    _logger.error("failed to set listener on %s : %s %s" %
                                  (tmpBroker, errtype, errvalue))
                    catch_sig(None, None)
            time.sleep(5)
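# Note: DataServiceUtils.checkCertificate is used above but not shown. A
# hedged sketch of what such a check might do with pyOpenSSL (hypothetical
# implementation; the real helper may differ):
import OpenSSL.crypto

def checkCertificate(certName):
    # return (ok, errorMessage) for a PEM certificate file
    try:
        with open(certName) as f:
            certData = f.read()
        cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
                                               certData)
        if cert.has_expired():
            return False, 'certificate %s has expired' % certName
        return True, None
    except Exception as e:
        return False, str(e)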
Example #3
 def run(self):
     # start
     try:
         byCallback = False
         if self.job == None:
             byCallback = True
             _logger.debug("start: %s" % self.dataset.name)
             _logger.debug("callback from %s" % self.site)
             # FIXME: remove once callbacks from BNLPANDA disappear
             if self.site == 'BNLPANDA':
                 self.site = 'BNL-OSG2_ATLASMCDISK'
             # instantiate site mapper
             siteMapper = SiteMapper(self.taskBuffer)
             # get computingSite/destinationSE
             computingSite, destinationSE = self.taskBuffer.getDestSE(
                 self.dataset.name)
             if destinationSE == None:
                 # try to get computingSite/destinationSE from ARCH to delete sub
                 # even if no active jobs left
                 computingSite, destinationSE = self.taskBuffer.getDestSE(
                     self.dataset.name, True)
                 if destinationSE == None:
                     _logger.error("cannot get source/destination for %s" %
                                   self.dataset.name)
                     _logger.debug("end: %s" % self.dataset.name)
                     return
             _logger.debug("src: %s" % computingSite)
             _logger.debug("dst: %s" % destinationSE)
             # get corresponding token
             tmpSrcSiteSpec = siteMapper.getSite(computingSite)
             tmpDstSiteSpec = siteMapper.getSite(destinationSE)
             _logger.debug(tmpDstSiteSpec.setokens)
             destToken = None
             for tmpToken, tmpDdmId in tmpDstSiteSpec.setokens.iteritems():
                 if self.site == tmpDdmId:
                     destToken = tmpToken
                     break
             _logger.debug("use Token=%s" % destToken)
             # get required tokens
             reqTokens = self.taskBuffer.getDestTokens(self.dataset.name)
             if reqTokens == None:
                 _logger.error("cannot get required token for %s" %
                               self.dataset.name)
                 _logger.debug("end: %s" % self.dataset.name)
                 return
             _logger.debug("req Token=%s" % reqTokens)
             # make bitmap for the token
             bitMap = 1
             if len(reqTokens.split(',')) > 1:
                 for tmpReqToken in reqTokens.split(','):
                     if tmpReqToken == destToken:
                         break
                     # shift one bit
                     bitMap <<= 1
             # completed bitmap
             compBitMap = (1 << len(reqTokens.split(','))) - 1
             # ignore the lowest bit for T1, file on DISK is already there
             if tmpSrcSiteSpec.ddm == tmpDstSiteSpec.ddm:
                 compBitMap = compBitMap & 0xFFFE
             # update bitmap in DB
             updatedBitMap = self.taskBuffer.updateTransferStatus(
                 self.dataset.name, bitMap)
             _logger.debug(
                 "transfer status:%s - comp:%s - bit:%s" %
                 (hex(updatedBitMap), hex(compBitMap), hex(bitMap)))
             # update output files
             if (updatedBitMap & compBitMap) == compBitMap:
                 ids = self.taskBuffer.updateOutFilesReturnPandaIDs(
                     self.dataset.name)
                 # set flag for T2 cleanup
                 self.dataset.status = 'cleanup'
                 self.taskBuffer.updateDatasets([self.dataset])
             else:
                 _logger.debug("end: %s" % self.dataset.name)
                 return
         else:
             _logger.debug("start: %s" % self.job.PandaID)
             # update input files
             ids = [self.job.PandaID]
         _logger.debug("IDs: %s" % ids)
         if len(ids) != 0:
             # get job
             if self.job == None:
                 jobs = self.taskBuffer.peekJobs(ids,
                                                 fromDefined=False,
                                                 fromArchived=False,
                                                 fromWaiting=False)
             else:
                 jobs = [self.job]
             # loop over all jobs
             for job in jobs:
                 if job == None:
                     continue
                 _logger.debug("Job: %s" % job.PandaID)
                 if job.jobStatus == 'transferring':
                     jobReady = True
                     failedFiles = []
                     noOutFiles = []
                     # check file status
                     for file in job.Files:
                         if file.type == 'output' or file.type == 'log':
                             if file.status == 'failed':
                                 failedFiles.append(file.lfn)
                             elif file.status == 'nooutput':
                                 noOutFiles.append(file.lfn)
                             elif file.status != 'ready':
                                 _logger.debug(
                                     "Job: %s file:%s %s != ready" %
                                     (job.PandaID, file.lfn, file.status))
                                 jobReady = False
                                 break
                     # finish job
                     if jobReady:
                         if byCallback:
                             _logger.debug("Job: %s all files ready" %
                                           job.PandaID)
                         else:
                             _logger.debug(
                                 "Job: %s all files checked with catalog" %
                                 job.PandaID)
                         # create XML
                         try:
                             import xml.dom.minidom
                             dom = xml.dom.minidom.getDOMImplementation()
                             doc = dom.createDocument(None, 'xml', None)
                             topNode = doc.createElement("POOLFILECATALOG")
                             for file in job.Files:
                                 if file.type in ['output', 'log']:
                                     # skip failed or no-output files
                                     if file.lfn in failedFiles + noOutFiles:
                                         continue
                                     # File
                                     fileNode = doc.createElement("File")
                                     fileNode.setAttribute("ID", file.GUID)
                                     # LFN
                                     logNode = doc.createElement("logical")
                                     lfnNode = doc.createElement("lfn")
                                     lfnNode.setAttribute('name', file.lfn)
                                     # metadata
                                     fsizeNode = doc.createElement(
                                         "metadata")
                                     fsizeNode.setAttribute(
                                         "att_name", "fsize")
                                     fsizeNode.setAttribute(
                                         "att_value", str(file.fsize))
                                     # checksum
                                     if file.checksum.startswith('ad:'):
                                         # adler32
                                         chksumNode = doc.createElement(
                                             "metadata")
                                         chksumNode.setAttribute(
                                             "att_name", "adler32")
                                         chksumNode.setAttribute(
                                             "att_value",
                                             re.sub('^ad:', '',
                                                    file.checksum))
                                     else:
                                         # md5sum
                                         chksumNode = doc.createElement(
                                             "metadata")
                                         chksumNode.setAttribute(
                                             "att_name", "md5sum")
                                         chksumNode.setAttribute(
                                             "att_value",
                                             re.sub('^md5:', '',
                                                    file.checksum))
                                     # append nodes
                                     logNode.appendChild(lfnNode)
                                     fileNode.appendChild(logNode)
                                     fileNode.appendChild(fsizeNode)
                                     fileNode.appendChild(chksumNode)
                                     topNode.appendChild(fileNode)
                             # status in file name
                             if failedFiles == []:
                                 statusFileName = 'finished'
                             else:
                                 statusFileName = 'failed'
                             # write to file
                             xmlFile = '%s/%s_%s_%s' % (
                                 panda_config.logdir, job.PandaID,
                                 statusFileName,
                                 commands.getoutput('uuidgen'))
                             oXML = open(xmlFile, "w")
                             oXML.write(topNode.toxml())
                             oXML.close()
                         except:
                             type, value, traceBack = sys.exc_info()
                             _logger.error("%s : %s %s" %
                                           (job.PandaID, type, value))
                 _logger.debug("Job: %s status: %s" %
                               (job.PandaID, job.jobStatus))
         # end
         if self.job == None:
             _logger.debug("end: %s" % self.dataset.name)
         else:
             _logger.debug("end: %s" % self.job.PandaID)
     except:
         type, value, traceBack = sys.exc_info()
         _logger.error("run() : %s %s" % (type, value))
Example #4
 def getSiteMapper(self):
     return SiteMapper(self.taskBuffer)
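# Note: a brief usage sketch for this accessor; the site name is
# illustrative and 'component' stands for whatever object defines
# getSiteMapper(), using calls that appear in the other examples here:
siteMapper = component.getSiteMapper()
if siteMapper.checkSite('ANALY_CERN'):
    siteSpec = siteMapper.getSite('ANALY_CERN')
    print siteSpec.status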
Example #5
        startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
        # kill old process
        if startTime < timeLimit:
            _logger.debug("old process : %s %s" % (pid,startTime))
            _logger.debug(line)            
            commands.getoutput('kill -9 %s' % pid)
except:
    type, value, traceBack = sys.exc_info()
    _logger.error("kill process : %s %s" % (type,value))
    

# instantiate TB
taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

_memoryCheck("rebroker")

# rebrokerage
_logger.debug("Rebrokerage start")

# get timeout value
timeoutVal = taskBuffer.getConfigValue('rebroker','ANALY_TIMEOUT')
if timeoutVal is None:
    timeoutVal = 12
_logger.debug("timeout value : {0}h".format(timeoutVal))    
try:
    normalTimeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=timeoutVal)
    sortTimeLimit   = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
    sql  = "SELECT jobDefinitionID,prodUserName,prodUserID,computingSite,MAX(modificationTime),jediTaskID,processingType "
Example #6
# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('prioryMassage')
tmpLog = LogWrapper(_logger)

tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# get usage breakdown
usageBreakDownPerUser = {}
usageBreakDownPerSite = {}
workingGroupList = []
for table in ['ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsArchived4']:
    varMap = {}
    varMap[':prodSourceLabel'] = 'user'
    if table == 'ATLAS_PANDA.jobsActive4':
        sql = "SELECT COUNT(*),prodUserName,jobStatus,workingGroup,computingSite FROM %s WHERE prodSourceLabel=:prodSourceLabel GROUP BY prodUserName,jobStatus,workingGroup,computingSite" % table
    else:
        # with time range for archived table
        varMap[':modificationTime'] = datetime.datetime.utcnow() - datetime.timedelta(minutes=60)
        sql = "SELECT COUNT(*),prodUserName,jobStatus,workingGroup,computingSite FROM %s WHERE prodSourceLabel=:prodSourceLabel AND modificationTime>:modificationTime GROUP BY prodUserName,jobStatus,workingGroup,computingSite" % table
Example #7
 def run(self):
     try:
         # get job
         tmpJobs = self.taskBuffer.getFullJobStatus([self.rPandaID])
         if tmpJobs == [] or tmpJobs[0] == None:
             _logger.debug("cannot find job for PandaID=%s" % self.rPandaID)
             return
         self.job = tmpJobs[0]
         _logger.debug("%s start %s:%s:%s" % (self.token,self.job.jobDefinitionID,self.job.prodUserName,self.job.computingSite))
         # require an output dataset container
         if not self.job.destinationDBlock.endswith('/'):
             _logger.debug("%s output dataset container is required" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # FIXME: don't touch group jobs for now
         if self.job.destinationDBlock.startswith('group') and (not self.userRequest):
             _logger.debug("%s skip group jobs" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # check processingType
         typesForRebro = ['pathena','prun','ganga','ganga-rbtest']
         if not self.job.processingType in typesForRebro:
             _logger.debug("%s skip processingType=%s not in %s" % \
                           (self.token,self.job.processingType,str(typesForRebro)))
             _logger.debug("%s end" % self.token)
             return
         # check jobsetID
         if self.job.jobsetID in [0,'NULL',None]:
             _logger.debug("%s jobsetID is undefined" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # check metadata 
         if self.job.metadata in [None,'NULL']:
             _logger.debug("%s metadata is unavailable" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # check --disableRebrokerage
         match = re.search("--disableRebrokerage",self.job.metadata)
         if match != None and (not self.simulation) and (not self.forceOpt) \
                and (not self.userRequest):
             _logger.debug("%s diabled rebrokerage" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # check --site
         match = re.search("--site",self.job.metadata)
         if match != None and (not self.simulation) and (not self.forceOpt) \
                and (not self.userRequest):
             _logger.debug("%s --site is used" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # check --libDS
         match = re.search("--libDS",self.job.metadata)
         if match != None:
             _logger.debug("%s --libDS is used" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # check --workingGroup since it is site-specific 
         match = re.search("--workingGroup",self.job.metadata)
         if match != None:
             _logger.debug("%s workingGroup is specified" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # avoid too many rebrokerages
         if not self.checkRev():
             _logger.debug("%s avoid too many rebrokerages" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # check if multiple JobIDs use the same libDS
         if self.bPandaID != None and self.buildStatus not in ['finished','failed']:
             if self.minPandaIDlibDS == None or self.maxPandaIDlibDS == None:
                 _logger.debug("%s max/min PandaIDs are unavailable for the libDS" % self.token)
                 _logger.debug("%s end" % self.token)
                 return
             tmpPandaIDsForLibDS = self.taskBuffer.getFullJobStatus([self.minPandaIDlibDS,self.maxPandaIDlibDS])
             if len(tmpPandaIDsForLibDS) != 2 or tmpPandaIDsForLibDS[0] == None or tmpPandaIDsForLibDS[1] == None:
                 _logger.debug("%s failed to get max/min PandaIDs for the libDS" % self.token)
                 _logger.debug("%s end" % self.token)
                 return
             # check
             if tmpPandaIDsForLibDS[0].jobDefinitionID != tmpPandaIDsForLibDS[1].jobDefinitionID:
                 _logger.debug("%s multiple JobIDs use the libDS %s:%s %s:%s" % (self.token,tmpPandaIDsForLibDS[0].jobDefinitionID,
                                                                                 self.minPandaIDlibDS,tmpPandaIDsForLibDS[1].jobDefinitionID,
                                                                                 self.maxPandaIDlibDS))
                 _logger.debug("%s end" % self.token)
                 return
         # check excludedSite
         if self.excludedSite == None:
             self.excludedSite = []
             match = re.search("--excludedSite( +|=)\s*(\'|\")*([^ \"\';$]+)",self.job.metadata)
             if match != None:
                 self.excludedSite = match.group(3).split(',')
         # remove empty
         try:
             self.excludedSite.remove('')
         except:
             pass
         _logger.debug("%s excludedSite=%s" % (self.token,str(self.excludedSite)))
         # check cloud
         if self.cloud == None:
             match = re.search("--cloud( +|=)\s*(\'|\")*([^ \"\';$]+)",self.job.metadata)
             if match != None:
                 self.cloud = match.group(3)
         _logger.debug("%s cloud=%s" % (self.token,self.cloud))
         # get inDS/LFNs
         status,tmpMapInDS,maxFileSize = self.taskBuffer.getInDatasetsForReBrokerage(self.jobID,self.userName)
         if not status:
             # failed
             _logger.error("%s failed to get inDS/LFN from DB" % self.token)
             return
         status,inputDS = self.getListDatasetsUsedByJob(tmpMapInDS)
         if not status:
             # failed
             _logger.error("%s failed" % self.token)
             return 
         # get replicas
         replicaMap = {}
         unknownSites = {} 
         for tmpDS in inputDS:
             if tmpDS.endswith('/'):
                 # container
                 status,tmpRepMaps = self.getListDatasetReplicasInContainer(tmpDS)
             else:
                 # normal dataset
                 status,tmpRepMap = self.getListDatasetReplicas(tmpDS)
                 tmpRepMaps = {tmpDS:tmpRepMap}
             if not status:
                 # failed
                 _logger.debug("%s failed" % self.token)
                 return 
             # make map per site
             for tmpDS,tmpRepMap in tmpRepMaps.iteritems():
                 for tmpSite,tmpStat in tmpRepMap.iteritems():
                     # ignore special sites
                     if tmpSite in ['CERN-PROD_TZERO','CERN-PROD_DAQ','CERN-PROD_TMPDISK']:
                         continue
                     # ignore tape sites
                     if tmpSite.endswith('TAPE'):
                         continue
                     # keep sites with unknown replica info 
                     if tmpStat[-1]['found'] == None:
                         if not unknownSites.has_key(tmpDS):
                             unknownSites[tmpDS] = []
                         unknownSites[tmpDS].append(tmpSite)
                     # ignore ToBeDeleted
                     if tmpStat[-1]['archived'] in ['ToBeDeleted',]:
                         continue
                     # aggregate EOS endpoints
                     if tmpSite.startswith('CERN-PROD_EOS'):
                         tmpSite = 'CERN-PROD_EOS'
                     # aggregate EOS TMP endpoints
                     if tmpSite.startswith('CERN-PROD_TMP'):
                         tmpSite = 'CERN-PROD_TMP'
                     # strip the _XYZDISK suffix (e.g. _SCRATCHDISK) to get the site name
                     tmpSite = re.sub('_[^_-]+DISK$','',tmpSite)
                     # strip the _PERF-XYZ suffix
                     tmpSite = re.sub('_PERF-[^_-]+$','',tmpSite)
                     # strip the _PHYS-XYZ suffix
                     tmpSite = re.sub('_PHYS-[^_-]+$','',tmpSite)
                     # patch for BNLPANDA
                     if tmpSite in ['BNLPANDA']:
                         tmpSite = 'BNL-OSG2'
                     # add to map    
                     if not replicaMap.has_key(tmpSite):
                         replicaMap[tmpSite] = {}
                     replicaMap[tmpSite][tmpDS] = tmpStat[-1]
         _logger.debug("%s replica map -> %s" % (self.token,str(replicaMap)))
         # refresh replica info if needed
         self.refreshReplicaInfo(unknownSites)
         # instantiate SiteMapper
         siteMapper = SiteMapper(self.taskBuffer)
         # get original DDM
         origSiteDDM = self.getAggName(siteMapper.getSite(self.job.computingSite).ddm)
         # check all datasets
         maxDQ2Sites = []
         if inputDS != []:
             # loop over all sites
             for tmpSite,tmpDsVal in replicaMap.iteritems():
                 # loop over all datasets
                 appendFlag = True
                 for tmpOrigDS in inputDS:
                     # check completeness
                     if tmpDsVal.has_key(tmpOrigDS) and tmpDsVal[tmpOrigDS]['found'] != None and \
                            tmpDsVal[tmpOrigDS]['total'] == tmpDsVal[tmpOrigDS]['found']:
                         pass
                     else:
                         appendFlag = False
                 # append
                 if appendFlag:
                     if not tmpSite in maxDQ2Sites:
                         maxDQ2Sites.append(tmpSite)
         _logger.debug("%s candidate DQ2s -> %s" % (self.token,str(maxDQ2Sites)))
         if inputDS != [] and maxDQ2Sites == []:
             _logger.debug("%s no DQ2 candidate" % self.token)
         else:
             maxPandaSites = []
             # original maxinputsize
             origMaxInputSize = siteMapper.getSite(self.job.computingSite).maxinputsize
             # look for Panda siteIDs
             for tmpSiteID,tmpSiteSpec in siteMapper.siteSpecList.iteritems():
                 # use ANALY_ only
                 if not tmpSiteID.startswith('ANALY_'):
                     continue
                 # remove test and local
                 if re.search('_test',tmpSiteID,re.I) != None:
                     continue
                 if re.search('_local',tmpSiteID,re.I) != None:
                     continue
                 # avoid same site
                 if self.avoidSameSite and self.getAggName(tmpSiteSpec.ddm) == origSiteDDM:
                     continue
                 # check DQ2 ID
                 if self.cloud in [None,tmpSiteSpec.cloud] \
                        and (self.getAggName(tmpSiteSpec.ddm) in maxDQ2Sites or inputDS == []):
                     # excluded sites
                     excludedFlag = False
                     for tmpExcSite in self.excludedSite:
                         if re.search(tmpExcSite,tmpSiteID) != None:
                             excludedFlag = True
                             break
                     if excludedFlag:
                         _logger.debug("%s skip %s since excluded" % (self.token,tmpSiteID))
                         continue
                     # use online only
                     if tmpSiteSpec.status != 'online':
                         _logger.debug("%s skip %s status=%s" % (self.token,tmpSiteID,tmpSiteSpec.status))
                         continue
                     # check maxinputsize
                     if (maxFileSize == None and origMaxInputSize > siteMapper.getSite(tmpSiteID).maxinputsize) or \
                            maxFileSize > siteMapper.getSite(tmpSiteID).maxinputsize:
                         _logger.debug("%s skip %s due to maxinputsize" % (self.token,tmpSiteID))
                         continue
                     # append
                     if not tmpSiteID in maxPandaSites:
                         maxPandaSites.append(tmpSiteID)
             # choose at most 20 sites randomly to avoid too many lookups
             random.shuffle(maxPandaSites)
             maxPandaSites = maxPandaSites[:20]
             _logger.debug("%s candidate PandaSites -> %s" % (self.token,str(maxPandaSites)))
             # no Panda siteIDs            
             if maxPandaSites == []:            
                 _logger.debug("%s no Panda site candidate" % self.token)
             else:
                 # set AtlasRelease and cmtConfig to dummy job
                 tmpJobForBrokerage = JobSpec()
                 if self.job.AtlasRelease in ['NULL',None]:
                     tmpJobForBrokerage.AtlasRelease = ''
                 else:
                     tmpJobForBrokerage.AtlasRelease = self.job.AtlasRelease
                 # use nightlies
                 matchNight = re.search('^AnalysisTransforms-.*_(rel_\d+)$',self.job.homepackage)
                 if matchNight != None:
                     tmpJobForBrokerage.AtlasRelease += ':%s' % matchNight.group(1)
                 # use cache
                 else:
                     matchCache = re.search('^AnalysisTransforms-([^/]+)',self.job.homepackage)
                     if matchCache != None:
                         tmpJobForBrokerage.AtlasRelease = matchCache.group(1).replace('_','-')
                 if not self.job.cmtConfig in ['NULL',None]:    
                     tmpJobForBrokerage.cmtConfig = self.job.cmtConfig
                 # memory size
                 if not self.job.minRamCount in ['NULL',None,0]:
                     tmpJobForBrokerage.minRamCount = self.job.minRamCount
                 # CPU count
                 if not self.job.maxCpuCount in ['NULL',None,0]:
                     tmpJobForBrokerage.maxCpuCount = self.job.maxCpuCount
                 # run brokerage
                 brokerage.broker.schedule([tmpJobForBrokerage],self.taskBuffer,siteMapper,forAnalysis=True,
                                           setScanSiteList=maxPandaSites,trustIS=True,reportLog=True)
                 newSiteID = tmpJobForBrokerage.computingSite
                 self.brokerageInfo += tmpJobForBrokerage.brokerageErrorDiag
                 _logger.debug("%s runBrokerage - > %s" % (self.token,newSiteID))
                 # unknown site
                 if not siteMapper.checkSite(newSiteID):
                     _logger.error("%s unknown site" % self.token)
                     _logger.debug("%s failed" % self.token)
                     return 
                 # get new site spec
                 newSiteSpec = siteMapper.getSite(newSiteID)
                 # avoid repetition
                 if self.getAggName(newSiteSpec.ddm) == origSiteDDM:
                     _logger.debug("%s assigned to the same site %s " % (self.token,newSiteID))
                     _logger.debug("%s end" % self.token)                        
                     return
                 # simulation mode
                 if self.simulation:
                     _logger.debug("%s end simulation" % self.token)                        
                     return
                 # prepare jobs
                 status = self.prepareJob(newSiteID,newSiteSpec)
                 if status:
                     # run SetUpper
                     statusSetUp = self.runSetUpper()
                     if not statusSetUp:
                         _logger.debug("%s runSetUpper failed" % self.token)
                     else:
                         _logger.debug("%s successfully assigned to %s" % (self.token,newSiteID))
         _logger.debug("%s end" % self.token)
     except:
         errType,errValue,errTraceBack = sys.exc_info()
         _logger.error("%s run() : %s %s" % (self.token,errType,errValue))
Example #8
from brokerage.SiteMapper import SiteMapper

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('shareMgr')

_logger.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# number of jobs to be activated per queue
nJobsPerQueue = 50

# priority threshold
prioCutoff = 950

# get high prio jobs without throttling
sql  = "SELECT distinct computingSite FROM ATLAS_PANDA.jobsActive4 "
sql += "WHERE jobStatus=:s1 AND prodSourceLabel IN (:p1) AND lockedBy=:lockedBy "
sql += "AND currentPriority>=:prioCutoff "
varMap = {}
varMap[':s1'] = 'throttled'
varMap[':p1'] = 'managed'
varMap[':lockedBy'] = 'jedi'
Example #9
# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('prioryMassage')
tmpLog = LogWrapper(_logger)


tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# get usage breakdown
usageBreakDownPerUser = {}
usageBreakDownPerSite = {}
workingGroupList = []
for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']:
	varMap = {}
	varMap[':prodSourceLabel'] = 'user'
	varMap[':pmerge'] = 'pmerge'
	if table == 'ATLAS_PANDA.jobsActive4':
		sql = "SELECT COUNT(*),prodUserName,jobStatus,workingGroup,computingSite FROM %s WHERE prodSourceLabel=:prodSourceLabel AND processingType<>:pmerge GROUP BY prodUserName,jobStatus,workingGroup,computingSite" % table
	else:
		# with time range for archived table
		varMap[':modificationTime'] = datetime.datetime.utcnow() - datetime.timedelta(minutes=60)
		sql = "SELECT COUNT(*),prodUserName,jobStatus,workingGroup,computingSite FROM %s WHERE prodSourceLabel=:prodSourceLabel AND processingType<>:pmerge AND modificationTime>:modificationTime GROUP BY prodUserName,jobStatus,workingGroup,computingSite" % table
Example #10
 def run(self):
     # start
     try:
         byCallback = False
         if self.job == None:
             byCallback = True
             _logger.debug("start: %s" % self.dataset.name)
             _logger.debug("callback from %s" % self.site)
             # FIXME: remove once callbacks from BNLPANDA disappear
             if self.site == 'BNLPANDA':
                 self.site = 'BNL-OSG2_ATLASMCDISK'
             # instantiate site mapper
             siteMapper = SiteMapper(self.taskBuffer)
             # get computingSite/destinationSE
             computingSite,destinationSE = self.taskBuffer.getDestSE(self.dataset.name)
             if destinationSE == None:
                 # try to get computingSite/destinationSE from ARCH to delete sub
                 # even if no active jobs left 
                 computingSite,destinationSE = self.taskBuffer.getDestSE(self.dataset.name,True)
                 if destinationSE == None:
                     _logger.error("cannot get source/destination for %s" % self.dataset.name)
                     _logger.debug("end: %s" % self.dataset.name)                
                     return
             _logger.debug("src: %s" % computingSite)
             _logger.debug("dst: %s" % destinationSE)
             # get corresponding token
             tmpSrcSiteSpec = siteMapper.getSite(computingSite)
             tmpDstSiteSpec = siteMapper.getSite(destinationSE)
             _logger.debug(tmpDstSiteSpec.setokens_output)
             destToken = None
             for tmpToken,tmpDdmId in tmpDstSiteSpec.setokens_output.iteritems():
                 if self.site == tmpDdmId:
                     destToken = tmpToken
                     break
             _logger.debug("use Token=%s" % destToken)
             # get required tokens
             reqTokens = self.taskBuffer.getDestTokens(self.dataset.name)
             if reqTokens == None:
                 _logger.error("cannot get required token for %s" % self.dataset.name)
                 _logger.debug("end: %s" % self.dataset.name)                
                 return
             _logger.debug("req Token=%s" % reqTokens)
             # make bitmap for the token
             bitMap = 1
             if len(reqTokens.split(','))>1:
                 for tmpReqToken in reqTokens.split(','):
                     if tmpReqToken == destToken:
                         break
                     # shift one bit
                     bitMap <<= 1
             # completed bitmap
             compBitMap = (1 << len(reqTokens.split(',')))-1
             # ignore the lowest bit for T1, file on DISK is already there
             if tmpSrcSiteSpec.ddm_output == tmpDstSiteSpec.ddm_output:
                 compBitMap = compBitMap & 0xFFFE
             # update bitmap in DB
             updatedBitMap = self.taskBuffer.updateTransferStatus(self.dataset.name,bitMap)
             _logger.debug("transfer status:%s - comp:%s - bit:%s" % (hex(updatedBitMap),hex(compBitMap),hex(bitMap)))
             # update output files
             if (updatedBitMap & compBitMap) == compBitMap:
                 ids = self.taskBuffer.updateOutFilesReturnPandaIDs(self.dataset.name)
                 # set flag for T2 cleanup
                 self.dataset.status = 'cleanup'
                 self.taskBuffer.updateDatasets([self.dataset])
             else:
                 _logger.debug("end: %s" % self.dataset.name)
                 return
         else:
             _logger.debug("start: %s" % self.job.PandaID)
             # update input files
             ids = [self.job.PandaID]
         _logger.debug("IDs: %s" % ids)
         if len(ids) != 0:
             # get job
             if self.job == None:
                 jobs = self.taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False)
             else:
                 jobs = [self.job]
             # loop over all jobs
             for job in jobs:
                 if job == None:
                     continue
                 _logger.debug("Job: %s" % job.PandaID)
                 if job.jobStatus == 'transferring':
                     jobReady = True
                     failedFiles = []
                     noOutFiles = []
                     # check file status
                     for file in job.Files:
                         if file.type == 'output' or file.type == 'log':
                             if file.status == 'failed':
                                 failedFiles.append(file.lfn)
                             elif file.status == 'nooutput':
                                 noOutFiles.append(file.lfn)
                             elif file.status != 'ready':
                                 _logger.debug("Job: %s file:%s %s != ready" % (job.PandaID,file.lfn,file.status))
                                 jobReady = False
                                 break
                     # finish job
                     if jobReady:
                         if byCallback:
                             _logger.debug("Job: %s all files ready" % job.PandaID)
                         else:
                             _logger.debug("Job: %s all files checked with catalog" % job.PandaID)
                         # create XML
                         try:
                             import xml.dom.minidom
                             dom = xml.dom.minidom.getDOMImplementation()
                             doc = dom.createDocument(None,'xml',None)
                             topNode = doc.createElement("POOLFILECATALOG")
                             for file in job.Files:
                                 if file.type in ['output','log']:
                                     # skip failed or no-output files
                                     if file.lfn in failedFiles+noOutFiles:
                                         continue
                                     # File
                                     fileNode = doc.createElement("File")
                                     fileNode.setAttribute("ID",file.GUID)
                                     # LFN
                                     logNode = doc.createElement("logical")
                                     lfnNode = doc.createElement("lfn")
                                     lfnNode.setAttribute('name',file.lfn)
                                     # metadata
                                     fsizeNode    = doc.createElement("metadata")
                                     fsizeNode.setAttribute("att_name","fsize")
                                     fsizeNode.setAttribute("att_value",str(file.fsize))
                                     # checksum
                                     if file.checksum.startswith('ad:'):
                                         # adler32
                                         chksumNode    = doc.createElement("metadata")
                                         chksumNode.setAttribute("att_name","adler32")
                                         chksumNode.setAttribute("att_value",re.sub('^ad:','',file.checksum))
                                     else:
                                         # md5sum
                                         chksumNode    = doc.createElement("metadata")
                                         chksumNode.setAttribute("att_name","md5sum")
                                         chksumNode.setAttribute("att_value",re.sub('^md5:','',file.checksum))
                                     # append nodes
                                     logNode.appendChild(lfnNode)
                                     fileNode.appendChild(logNode)
                                     fileNode.appendChild(fsizeNode)
                                     fileNode.appendChild(chksumNode)
                                     topNode.appendChild(fileNode)
                             # status in file name
                             if failedFiles == []:
                                 statusFileName = 'finished'
                             else:
                                 statusFileName = 'failed'
                             # write to file
                             xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,job.PandaID,statusFileName,commands.getoutput('uuidgen'))
                             oXML = open(xmlFile,"w")
                             oXML.write(topNode.toxml())
                             oXML.close()
                         except:
                             type, value, traceBack = sys.exc_info()
                             _logger.error("Job: %s %s %s" % (job.PandaID,type,value))
                 _logger.debug("Job: %s status: %s" % (job.PandaID,job.jobStatus))                
         # end
         if self.job == None:        
             _logger.debug("end: %s" % self.dataset.name)
         else:
             _logger.debug("end: %s" % self.job.PandaID)
     except:
         type, value, traceBack = sys.exc_info()
         _logger.error("run() : %s %s" % (type,value))
Example #11
import re
import sys
import urllib2, urllib
from dq2.info import TiersOfATLAS

import userinterface.Client as Client
from userinterface.Client import baseURLSSL

from taskbuffer.TaskBuffer import taskBuffer
from brokerage.SiteMapper import SiteMapper
from config import panda_config

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

import httplib
import commands

jobID = sys.argv[1]
s, o = Client.getJobStatus([jobID])

if s != 0:
    print "failed to get job with:%s" % s
    sys.exit(0)

job = o[0]

if job == None:
    print "got None"
Example #12
# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('esPreemption')
tmpLog = LogWrapper(_logger)

tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# time limit
timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=15)

# get low priority ES jobs per site
sqlEsJobs = "SELECT PandaID,computingSite,commandToPilot,startTime "
sqlEsJobs += "FROM {0}.jobsActive4 ".format(panda_config.schemaPANDA)
sqlEsJobs += "WHERE prodSourceLabel IN (:label1,:label2) AND eventService=:es "
sqlEsJobs += "AND currentPriority<:prio AND jobStatus=:jobStat "
sqlEsJobs += "ORDER BY currentPriority,PandaID "

varMap = {}
varMap[':label1'] = 'managed'
varMap[':label2'] = 'test'
varMap[':es'] = 1
Example #13
 def run(self):
     try:
         _logger.debug('%s Start %s' % (self.pandaID,self.job.jobStatus))
         flagComplete    = True
         ddmJobs         = []
         topUserDsList   = []
         usingMerger     = False        
         disableNotifier = False
         firstIndvDS     = True
         finalStatusDS   = []
         for destinationDBlock in self.destinationDBlocks:
             dsList = []
             _logger.debug('%s start %s' % (self.pandaID,destinationDBlock))
             # ignore tid datasets
             if re.search('_tid[\d_]+$',destinationDBlock):
                 _logger.debug('%s skip %s' % (self.pandaID,destinationDBlock))                
                 continue
             # ignore HC datasets
             if re.search('^hc_test\.',destinationDBlock) != None or re.search('^user\.gangarbt\.',destinationDBlock) != None:
                 if re.search('_sub\d+$',destinationDBlock) == None and re.search('\.lib$',destinationDBlock) == None:
                     _logger.debug('%s skip HC %s' % (self.pandaID,destinationDBlock))                
                     continue
             # query dataset
             if self.datasetMap.has_key(destinationDBlock):
                 dataset = self.datasetMap[destinationDBlock]
             else:
                 dataset = self.taskBuffer.queryDatasetWithMap({'name':destinationDBlock})
             if dataset == None:
                 _logger.error('%s Not found : %s' % (self.pandaID,destinationDBlock))
                 flagComplete = False
                 continue
             # skip tobedeleted/tobeclosed 
             if dataset.status in ['cleanup','tobeclosed','completed']:
                 _logger.debug('%s skip %s due to %s' % (self.pandaID,destinationDBlock,dataset.status))
                 continue
             dsList.append(dataset)
             # sort
             dsList.sort()
             # count number of completed files
             notFinish = self.taskBuffer.countFilesWithMap({'destinationDBlock':destinationDBlock,
                                                            'status':'unknown'})
             if notFinish < 0:
                 _logger.error('%s Invalid DB return : %s' % (self.pandaID,notFinish))
                 flagComplete = False                
                 continue
             # check if completed
             _logger.debug('%s notFinish:%s' % (self.pandaID,notFinish))
             if self.job.destinationSE == 'local' and self.job.prodSourceLabel in ['user','panda']:
                 # close non-DQ2 destinationDBlock immediately
                 finalStatus = 'closed'
             elif self.job.lockedby == 'jedi' and self.isTopLevelDS(destinationDBlock):
                 # set it closed in order not to trigger DDM cleanup. It will be closed by JEDI
                 finalStatus = 'closed'
             elif self.job.prodSourceLabel in ['user'] and "--mergeOutput" in self.job.jobParameters \
                      and self.job.processingType != 'usermerge':
                 # merge output files
                 if firstIndvDS:
                     # set 'tobemerged' to only the first dataset to avoid triggering many Mergers for --individualOutDS
                     finalStatus = 'tobemerged'
                     firstIndvDS = False
                 else:
                     finalStatus = 'tobeclosed'
                 # set merging to top dataset
                 usingMerger = True
                 # disable Notifier
                 disableNotifier = True
             elif self.job.produceUnMerge():
                 finalStatus = 'doing'
             else:
                 # set status to 'tobeclosed' to trigger DQ2 closing
                 finalStatus = 'tobeclosed'
             if notFinish==0: 
                 _logger.debug('%s set %s to dataset : %s' % (self.pandaID,finalStatus,destinationDBlock))
                 # set status
                 dataset.status = finalStatus
                 # update dataset in DB
                 retT = self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                       criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                 if len(retT) > 0 and retT[0]==1:
                     finalStatusDS += dsList
                     # close user datasets
                     if self.job.prodSourceLabel in ['user'] and self.job.destinationDBlock.endswith('/') \
                            and (dataset.name.startswith('user') or dataset.name.startswith('group')):
                         # get top-level user dataset 
                         topUserDsName = re.sub('_sub\d+$','',dataset.name)
                         # update if it is the first attempt
                         if topUserDsName != dataset.name and not topUserDsName in topUserDsList and self.job.lockedby != 'jedi':
                             topUserDs = self.taskBuffer.queryDatasetWithMap({'name':topUserDsName})
                             if topUserDs != None:
                                 # check status
                                 if topUserDs.status in ['completed','cleanup','tobeclosed',
                                                         'tobemerged','merging']:
                                     _logger.debug('%s skip %s due to status=%s' % (self.pandaID,topUserDsName,topUserDs.status))
                                 else:
                                     # set status
                                     if self.job.processingType.startswith('gangarobot') or \
                                            self.job.processingType.startswith('hammercloud'):
                                         # not trigger freezing for HC datasets so that files can be appended
                                         topUserDs.status = 'completed'
                                     elif not usingMerger:
                                         topUserDs.status = finalStatus
                                     else:
                                         topUserDs.status = 'merging'
                                     # append to avoid repetition
                                     topUserDsList.append(topUserDsName)
                                     # update DB
                                     retTopT = self.taskBuffer.updateDatasets([topUserDs],withLock=True,withCriteria="status<>:crStatus",
                                                                              criteriaMap={':crStatus':topUserDs.status})
                                     if len(retTopT) > 0 and retTopT[0]==1:
                                         _logger.debug('%s set %s to top dataset : %s' % (self.pandaID,topUserDs.status,topUserDsName))
                                     else:
                                         _logger.debug('%s failed to update top dataset : %s' % (self.pandaID,topUserDsName))
                         # get parent dataset for merge job
                         if self.job.processingType == 'usermerge':
                             tmpMatch = re.search('--parentDS ([^ \'\"]+)',self.job.jobParameters)
                             if tmpMatch == None:
                                 _logger.error('%s failed to extract parentDS' % self.pandaID)
                             else:
                                 unmergedDsName = tmpMatch.group(1)
                                 # update if it is the first attempt
                                 if not unmergedDsName in topUserDsList:
                                     unmergedDs = self.taskBuffer.queryDatasetWithMap({'name':unmergedDsName})
                                     if unmergedDs == None:
                                         _logger.error('%s failed to get parentDS=%s from DB' % (self.pandaID,unmergedDsName))
                                     else:
                                         # check status
                                         if unmergedDs.status in ['completed','cleanup','tobeclosed']:
                                             _logger.debug('%s skip %s due to status=%s' % (self.pandaID,unmergedDsName,unmergedDs.status))
                                         else:
                                             # set status
                                             unmergedDs.status = finalStatus
                                             # append to avoid repetition
                                             topUserDsList.append(unmergedDsName)
                                             # update DB
                                             retTopT = self.taskBuffer.updateDatasets([unmergedDs],withLock=True,withCriteria="status<>:crStatus",
                                                                                      criteriaMap={':crStatus':unmergedDs.status})
                                             if len(retTopT) > 0 and retTopT[0]==1:
                                                 _logger.debug('%s set %s to parent dataset : %s' % (self.pandaID,unmergedDs.status,unmergedDsName))
                                             else:
                                                 _logger.debug('%s failed to update parent dataset : %s' % (self.pandaID,unmergedDsName))
                     if self.pandaDDM and self.job.prodSourceLabel=='managed':
                         # instantiate SiteMapper
                         if self.siteMapper == None:
                             self.siteMapper = SiteMapper(self.taskBuffer)
                         # get file list for PandaDDM
                         retList = self.taskBuffer.queryFilesWithMap({'destinationDBlock':destinationDBlock})
                         lfnsStr = ''
                         guidStr = ''
                         for tmpFile in retList:
                             if tmpFile.type in ['log','output']:
                                 lfnsStr += '%s,' % tmpFile.lfn
                                 guidStr += '%s,' % tmpFile.GUID
                         if lfnsStr != '':
                             guidStr = guidStr[:-1]
                             lfnsStr = lfnsStr[:-1]
                             # create a DDM job
                             ddmjob = JobSpec()
                             ddmjob.jobDefinitionID   = int(time.time()) % 10000
                             ddmjob.jobName           = "%s" % commands.getoutput('uuidgen')
                             ddmjob.transformation    = 'http://pandaserver.cern.ch:25080/trf/mover/run_dq2_cr'
                             ddmjob.destinationDBlock = 'testpanda.%s' % ddmjob.jobName
                             ddmjob.computingSite     = "BNL_ATLAS_DDM"
                             ddmjob.destinationSE     = ddmjob.computingSite
                             ddmjob.currentPriority   = 200000
                             ddmjob.prodSourceLabel   = 'ddm'
                             ddmjob.transferType      = 'sub'
                             # append log file
                             fileOL = FileSpec()
                             fileOL.lfn = "%s.job.log.tgz" % ddmjob.jobName
                             fileOL.destinationDBlock = ddmjob.destinationDBlock
                             fileOL.destinationSE     = ddmjob.destinationSE
                             fileOL.dataset           = ddmjob.destinationDBlock
                             fileOL.type = 'log'
                             ddmjob.addFile(fileOL)
                             # make arguments
                             dstDQ2ID = 'BNLPANDA'
                             srcDQ2ID = self.siteMapper.getSite(self.job.computingSite).ddm
                             callBackURL = 'https://%s:%s/server/panda/datasetCompleted?vuid=%s&site=%s' % \
                                           (panda_config.pserverhost,panda_config.pserverport,
                                            dataset.vuid,dstDQ2ID)
                             _logger.debug(callBackURL)
                             # set src/dest
                             ddmjob.sourceSite      = srcDQ2ID
                             ddmjob.destinationSite = dstDQ2ID
                             # if src==dst, send callback without ddm job
                             if dstDQ2ID == srcDQ2ID:
                                 comout = commands.getoutput('curl -k %s' % callBackURL)
                                 _logger.debug(comout)
                             else:
                                 # run dq2_cr
                                 callBackURL = urllib.quote(callBackURL)
                                 # get destination dir
                                 destDir = brokerage.broker_util._getDefaultStorage(self.siteMapper.getSite(self.job.computingSite).dq2url)
                                 argStr = "-s %s -r %s --guids %s --lfns %s --callBack %s -d %s/%s %s" % \
                                          (srcDQ2ID,dstDQ2ID,guidStr,lfnsStr,callBackURL,destDir,
                                           destinationDBlock,destinationDBlock)
                                 # set job parameters
                                 ddmjob.jobParameters = argStr
                                 _logger.debug('%s pdq2_cr %s' % (self.pandaID,ddmjob.jobParameters))
                                 ddmJobs.append(ddmjob)
                     # start Activator
                      if re.search(r'_sub\d+$',dataset.name) == None:
                         if self.job.prodSourceLabel=='panda' and self.job.processingType in ['merge','unmerge']:
                             # don't trigger Activator for merge jobs
                             pass
                         else:
                             if self.job.jobStatus == 'finished':
                                 aThr = Activator(self.taskBuffer,dataset)
                                 aThr.start()
                                 aThr.join()
                  else:
                      # another thread already updated the dataset; keep flagComplete as-is
                      pass
             else:
                 # update dataset in DB
                 self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                 # unset flag
                 flagComplete = False
             # end
             _logger.debug('%s end %s' % (self.pandaID,destinationDBlock))
         # start DDM jobs
         if ddmJobs != []:
             self.taskBuffer.storeJobs(ddmJobs,self.job.prodUserID,joinThr=True)
         # change pending jobs to failed
         finalizedFlag = True
         if flagComplete and self.job.prodSourceLabel=='user':
             _logger.debug('%s finalize %s %s' % (self.pandaID,self.job.prodUserName,self.job.jobDefinitionID))
             finalizedFlag = self.taskBuffer.finalizePendingJobs(self.job.prodUserName,self.job.jobDefinitionID,waitLock=True)
             _logger.debug('%s finalized with %s' % (self.pandaID,finalizedFlag))
         # update unmerged datasets in JEDI to trigger merging
         if flagComplete and self.job.produceUnMerge() and finalStatusDS != []:
             if finalizedFlag:
                 self.taskBuffer.updateUnmergedDatasets(self.job,finalStatusDS)
         # start notifier
         _logger.debug('%s source:%s complete:%s' % (self.pandaID,self.job.prodSourceLabel,flagComplete))
         if (self.job.jobStatus != 'transferring') and ((flagComplete and self.job.prodSourceLabel=='user') or \
            (self.job.jobStatus=='failed' and self.job.prodSourceLabel=='panda')) and \
            self.job.lockedby != 'jedi':
             # don't send email for merge jobs
              if (not disableNotifier) and self.job.processingType not in ['merge','unmerge']:
                 useNotifier = True
                 summaryInfo = {}
                 # check all jobDefIDs in jobsetID
                  if self.job.jobsetID not in [0,None,'NULL']:
                     useNotifier,summaryInfo = self.taskBuffer.checkDatasetStatusForNotifier(self.job.jobsetID,self.job.jobDefinitionID,
                                                                                             self.job.prodUserName)
                     _logger.debug('%s useNotifier:%s' % (self.pandaID,useNotifier))
                 if useNotifier:
                     _logger.debug('%s start Notifier' % self.pandaID)
                     nThr = Notifier.Notifier(self.taskBuffer,self.job,self.destinationDBlocks,summaryInfo)
                     nThr.run()
                     _logger.debug('%s end Notifier' % self.pandaID)                    
         _logger.debug('%s End' % self.pandaID)
     except:
         errType,errValue = sys.exc_info()[:2]
         _logger.error("%s %s" % (errType,errValue))
Example No. 14
import re
import sys
import urllib2,urllib
from dq2.info import TiersOfATLAS

import userinterface.Client as Client
from userinterface.Client import baseURLSSL

from taskbuffer.TaskBuffer import taskBuffer
from brokerage.SiteMapper import SiteMapper
from config import panda_config

# instantiate TB
taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)


import httplib
import commands

id = sys.argv[1]
s,o = Client.getJobStatus([id])

if s != 0:
    print "failed to get job with:%s" % s
    sys.exit(0)

job = o[0]

if job == None:
    print "job %s not found" % id
    sys.exit(0)
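
For reference, a self-contained sketch of the status-polling flow this snippet begins. It assumes the return convention visible above: Client.getJobStatus yields a status code plus a list holding a JobSpec-like object (or None for an unknown ID). The helper name is ours.

import sys
import userinterface.Client as Client

# hypothetical helper: poll one PandaID and print a few JobSpec fields
def showJob(pandaID):
    s, o = Client.getJobStatus([pandaID])
    if s != 0:
        print "failed to get job status: %s" % s
        return
    job = o[0]
    if job == None:
        print "job %s not found" % pandaID
        return
    print "PandaID=%s status=%s site=%s" % (pandaID, job.jobStatus,
                                            job.computingSite)

if __name__ == '__main__':
    showJob(sys.argv[1])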
Example No. 15
        startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
        # kill old process
        if startTime < timeLimit:
            _logger.debug("old process : %s %s" % (pid,startTime))
            _logger.debug(line)            
            commands.getoutput('kill -9 %s' % pid)
except:
    errType, errValue = sys.exc_info()[:2]
    _logger.error("kill process : %s %s" % (errType,errValue))
    

# instantiate TB
taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

_memoryCheck("rebroker")

# rebrokerage
_logger.debug("Rebrokerage start")
try:
    normalTimeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=24)
    sortTimeLimit   = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
    sql  = "SELECT jobDefinitionID,prodUserName,prodUserID,computingSite,MAX(modificationTime),jediTaskID,processingType "
    sql += "FROM ATLAS_PANDA.jobsActive4 "
    sql += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND jobStatus IN (:jobStatus1,:jobStatus2) "
    sql += "AND modificationTime<:modificationTime "
    sql += "AND jobsetID IS NOT NULL "    
    sql += "AND lockedBy=:lockedBy "
    sql += "GROUP BY jobDefinitionID,prodUserName,prodUserID,computingSite,jediTaskID,processingType " 
Example No. 16
        startTime = datetime.datetime(
            *time.strptime(timeM.group(1), '%b %d %H:%M:%S %Y')[:6])
        # kill old process
        if startTime < timeLimit:
            tmpLog.debug("old process : %s %s" % (pid, startTime))
            tmpLog.debug(line)
            commands.getoutput('kill -9 %s' % pid)
except:
    errType, errValue = sys.exc_info()[:2]
    tmpLog.error("kill process : %s %s" % (errType, errValue))

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
aSiteMapper = SiteMapper(taskBuffer)

# delete
tmpLog.debug("Del session")
status, retSel = taskBuffer.querySQLS(
    "SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {})
if retSel != None:
    try:
        maxID = retSel[0][0]
        tmpLog.debug("maxID : %s" % maxID)
        if maxID != None:
            varMap = {}
            varMap[':maxID'] = maxID
            varMap[':jobStatus1'] = 'activated'
            varMap[':jobStatus2'] = 'waiting'
            varMap[':jobStatus3'] = 'failed'
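
The snippet ends while the bind map is still being filled. A hedged guess at the shape of the follow-up call, using the same querySQLS API; the statement itself is an assumption for illustration:

# assumed follow-up: select stale jobs below maxID for the "Del session";
# the real SQL is not visible in this snippet
sqlSel  = "SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 "
sqlSel += "WHERE PandaID<=:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3) "
status, res = taskBuffer.querySQLS(sqlSel, varMap)
if res != None:
    tmpLog.debug("%s jobs matched for cleanup" % len(res))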
Example No. 17
# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('esPreemption')
tmpLog = LogWrapper(_logger)


tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# time limit
timeLimit = datetime.datetime.utcnow()-datetime.timedelta(minutes=15)

# get low priority ES jobs per site
sqlEsJobs  = "SELECT PandaID,computingSite,commandToPilot,startTime "
sqlEsJobs += "FROM {0}.jobsActive4 ".format(panda_config.schemaPANDA)
sqlEsJobs += "WHERE prodSourceLabel IN (:label1,:label2) AND eventService=:es "
sqlEsJobs += "AND currentPriority<:prio AND jobStatus=:jobStat "
sqlEsJobs += "ORDER BY currentPriority,PandaID "

varMap = {}
varMap[':label1'] = 'managed'
varMap[':label2'] = 'test'
varMap[':es'] = 1
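
The example stops partway through the bind map, and the SQL above still expects :prio and :jobStat. A hedged completion and execution sketch, with an illustrative priority cutoff and job status:

# assumed values for the remaining binds; the real cutoffs may differ
varMap[':prio'] = 200
varMap[':jobStat'] = 'running'
status, res = taskBuffer.querySQLS(sqlEsJobs, varMap)
if res != None:
    # one row per low-priority event-service job, as in the SELECT list
    for pandaID, computingSite, commandToPilot, startTime in res:
        tmpLog.debug("low-prio ES job %s at %s" % (pandaID, computingSite))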