def __call__(self, *args, **kwargs):
    """Forward this method call to the master process and wait for the result.

    The call is serialized with pickle into a shared communication dict slot,
    the master is notified via a lock, and the child blocks until the response
    arrives.  Raises RuntimeError when the master reports a non-zero status.
    NOTE(review): the lock acquire/release ordering below is the IPC protocol —
    do not reorder these statements.
    """
    log = LogWrapper(
        _logger, 'pid={} thr={} {}'.format(os.getpid(),
                                           threading.current_thread().ident,
                                           self.methodName))
    log.debug('start')
    # get lock among children (slot index into the shared structures)
    i = self.childlock.get()
    # make dict to send it master (args/kwargs are pickled for transport)
    self.commDict[i].update({
        'methodName': self.methodName,
        'args': pickle.dumps(args),
        'kwargs': pickle.dumps(kwargs)
    })
    # send notification to master
    self.comLock[i].release()
    # wait response
    self.resLock[i].acquire()
    res = pickle.loads(self.commDict[i]['res'])
    statusCode = self.commDict[i]['stat']
    # release lock to children (return the slot to the pool)
    self.childlock.put(i)
    log.debug('end')
    # return result, or re-raise the master-side error as RuntimeError
    if statusCode == 0:
        return res
    else:
        errtype, errvalue = res
        raise RuntimeError("{0}: {1} {2}".format(self.methodName,
                                                 errtype.__name__, errvalue))
class CloserAtlasPlugin:
    """Close '_sub' datasets right away for urgent/high-priority production jobs.

    Best-effort plugin: failures are logged and never propagated; execute()
    always returns True.
    """

    # constructor
    def __init__(self, job, datasets, log):
        self.jobSpec = job
        self.datasets = datasets
        self.tmpLog = LogWrapper(log, "{0} CloserAtlasPlugin".format(self.jobSpec.PandaID))

    # execute
    def execute(self):
        """Close eligible sub-datasets via rucio. Always returns True."""
        try:
            # only for production
            if self.jobSpec.prodSourceLabel not in ['managed', 'test']:
                return True
            # only for urgent or high prio
            if self.jobSpec.processingType not in ['urgent'] and self.jobSpec.currentPriority <= 1000:
                return True
            # close datasets
            for datasetSpec in self.datasets:
                # raw string fixes the invalid \d escape in the original pattern
                if re.search(r'_sub\d+$', datasetSpec.name) is None:
                    continue
                if datasetSpec.status != 'tobeclosed':
                    continue
                try:
                    self.tmpLog.debug('immediate close {0}'.format(datasetSpec.name))
                    rucioAPI.closeDataset(datasetSpec.name)
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    self.tmpLog.warning('failed to close : {0} {1}'.format(errtype, errvalue))
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            self.tmpLog.warning('failed to execute : {0} {1}'.format(errtype, errvalue))
        return True
def run(self):
    """Look up the dataset for this thread and kick off follow-up processing.

    Dispatch datasets trigger job activation; output datasets either start
    unmerge jobs (panda.*_zip) or finish transferring jobs.
    """
    # get logger
    tmpLog = LogWrapper(_logger, '<vuid={0} site={1} name={2}>'.format(self.vuid, self.site, self.dataset))
    # query dataset by vuid when available, otherwise by name
    tmpLog.debug("start")
    if self.vuid is not None:
        dataset = self.taskBuffer.queryDatasetWithMap({'vuid': self.vuid})
    else:
        dataset = self.taskBuffer.queryDatasetWithMap({'name': self.dataset})
    if dataset is None:
        tmpLog.error("Not found")
        tmpLog.debug("end")
        return
    tmpLog.debug("type:%s name:%s" % (dataset.type, dataset.name))
    if dataset.type == 'dispatch':
        # activate jobs in jobsDefined
        Activator(self.taskBuffer, dataset).start()
    if dataset.type == 'output':
        # raw string fixes the invalid \. escape in the original pattern
        if dataset.name is not None and re.search(r'^panda\..*_zip$', dataset.name) is not None:
            # start unmerge jobs
            Activator(self.taskBuffer, dataset, enforce=True).start()
        else:
            # finish transferring jobs
            Finisher(self.taskBuffer, dataset, site=self.site).start()
    tmpLog.debug("end")
def delete_checkpoint(req, task_id, sub_id):
    """Remove the checkpoint file for (task_id, sub_id) and return a JSON status."""
    tmpLog = LogWrapper(_logger, 'delete_checkpoint <jediTaskID={0} ID={1}>'.format(task_id, sub_id))
    # reject non-HTTPS requests up front
    if not Protocol.isSecure(req):
        msg = 'insecure request'
        tmpLog.error(msg)
        return json.dumps({'status': False, 'message': msg})
    tmpLog.debug("start %s" % req.subprocess_env['SSL_CLIENT_S_DN'])
    status = True
    try:
        target_path = os.path.join(panda_config.cache_dir,
                                   get_checkpoint_filename(task_id, sub_id))
        os.remove(target_path)
        msg = 'done'
        tmpLog.debug(msg)
    except Exception as exc:
        # removal failed (e.g. file missing) — report the reason to the caller
        msg = "failed to delete file due to {0}".format(str(exc))
        tmpLog.error(msg)
        status = False
    return json.dumps({'status': status, 'message': msg})
def put_workflow_request(req, data, check=False):
    """Accept a workflow request and stage it as JSON in the cache directory.

    With check enabled, the request is validated immediately via
    WorkflowProcessor and the staged file is removed afterwards.
    """
    if not Protocol.isSecure(req):
        return json.dumps((False, "ERROR : no HTTPS"))
    userName = req.subprocess_env['SSL_CLIENT_S_DN']
    creationTime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
    tmpLog = LogWrapper(_logger, 'put_workflow_request')
    tmpLog.debug("start user={} check={}".format(userName, check))
    # normalize the flag: accept either the string 'True' or boolean True
    check = check is True or check == 'True'
    try:
        # stage the request under a unique name in the cache directory
        evpFileName = '%s/workflow.%s' % (panda_config.cache_dir, str(uuid.uuid4()))
        tmpLog.debug("file={}".format(evpFileName))
        payload = {"userName": userName,
                   "creationTime": creationTime,
                   "data": json.loads(data)}
        with open(evpFileName, 'w') as fo:
            json.dump(payload, fo)
        if check:
            tmpLog.debug('checking')
            from pandaserver.taskbuffer.workflow_processor import WorkflowProcessor
            processor = WorkflowProcessor(log_stream=_logger)
            ret = processor.process(evpFileName, True, True, True, True)
            # best-effort cleanup of the staged file after a check run
            if os.path.exists(evpFileName):
                try:
                    os.remove(evpFileName)
                except Exception:
                    pass
            tmpLog.debug('done')
            return json.dumps((True, ret))
    except Exception as e:
        errStr = "cannot put request due to {} ".format(str(e))
        tmpLog.error(errStr + traceback.format_exc())
        return json.dumps((False, errStr))
    tmpLog.debug('done')
    return json.dumps((True, 'request was accepted and will be processed in a few minutes'))
def put_file_recovery_request(req, jediTaskID, dryRun=None):
    """Queue a file-recovery request for a JEDI task as a JSON file in the cache dir."""
    if not Protocol.isSecure(req):
        return json.dumps((False, "ERROR : no HTTPS"))
    userName = req.subprocess_env['SSL_CLIENT_S_DN']
    creationTime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
    tmpLog = LogWrapper(_logger, 'put_file_recovery_request < jediTaskID={}'.format(jediTaskID))
    tmpLog.debug("start user={}".format(userName))
    try:
        # validate the task ID early — a bad value is reported as an error
        jediTaskID = int(jediTaskID)
        # stage the request under a unique name in the cache directory
        evpFileName = '%s/recov.%s' % (panda_config.cache_dir, str(uuid.uuid4()))
        tmpLog.debug("file={}".format(evpFileName))
        payload = {"userName": userName,
                   "creationTime": creationTime,
                   "jediTaskID": jediTaskID}
        if dryRun:
            payload['dryRun'] = True
        with open(evpFileName, 'w') as fo:
            json.dump(payload, fo)
    except Exception as e:
        errStr = "cannot put request due to {} ".format(str(e))
        tmpLog.error(errStr + traceback.format_exc())
        return json.dumps((False, errStr))
    tmpLog.debug('done')
    return json.dumps((True, 'request was accepted and will be processed in a few minutes'))
def main(tbuf=None, **kwargs):
    """Refresh cached MyProxy credentials for all active users, for every configured role."""
    tmpLog = LogWrapper(_logger)
    tmpLog.debug("================= start ==================")
    # instantiate TB (reuse the caller's TaskBuffer when supplied)
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        taskBuffer = tbuf
    # instantiate MyProxy I/F
    my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface()
    # roles to refresh, from config when present, defaults otherwise
    if hasattr(panda_config, 'proxy_cache_roles'):
        roles = panda_config.proxy_cache_roles.split(',')
    else:
        roles = ['atlas', 'atlas:/atlas/Role=production', 'atlas:/atlas/Role=pilot']
    # get users with the relevant grid preference
    sql = 'select distinct DN FROM ATLAS_PANDAMETA.users WHERE GRIDPREF LIKE :patt'
    varMap = {':patt': '%p%'}
    tmpStat, tmpRes = taskBuffer.querySQLS(sql, varMap)
    for realDN, in tmpRes:
        if realDN is None:
            continue
        realDN = CoreUtils.get_bare_dn(realDN, keep_digits=False)
        name = taskBuffer.cleanUserID(realDN)
        # check proxy for each role
        tmpLog.debug("check proxy cache for {}".format(name))
        for role in roles:
            my_proxy_interface_instance.checkProxy(realDN, role=role, name=name)
    tmpLog.debug("done")
def put_checkpoint(req, file):
    """Store an uploaded checkpoint file in the cache directory.

    The uploaded filename must be '<taskID>_<subID>'.  Returns a JSON dict
    with 'status' (bool) and 'message'.  Uploads above 500 MB are rejected.
    """
    tmpLog = LogWrapper(_logger, 'put_checkpoint <jediTaskID_subID={0}>'.format(file.filename))
    status = False
    if not Protocol.isSecure(req):
        errStr = 'insecure request'
        tmpLog.error(errStr)
        return json.dumps({'status': status, 'message': errStr})
    tmpLog.debug("start %s" % req.subprocess_env['SSL_CLIENT_S_DN'])
    # extract taskID and subID
    try:
        task_id, sub_id = file.filename.split('/')[-1].split('_')
    except Exception:
        errStr = 'failed to extract ID'
        tmpLog.error(errStr)
        return json.dumps({'status': status, 'message': errStr})
    # size check
    sizeLimit = 500 * 1024 * 1024
    # get file size
    try:
        # bug fix: was long(), a Python 2 builtin removed in Python 3, which
        # made every upload fail here with a NameError
        contentLength = int(req.headers_in["content-length"])
    except Exception as e:
        errStr = "cannot get int(content-length) due to {0}".format(str(e))
        tmpLog.error(errStr)
        return json.dumps({'status': status, 'message': errStr})
    tmpLog.debug("size %s" % contentLength)
    if contentLength > sizeLimit:
        errStr = "exceeded size limit %s>%s" % (contentLength, sizeLimit)
        tmpLog.error(errStr)
        return json.dumps({'status': status, 'message': errStr})
    try:
        fileFullPath = os.path.join(panda_config.cache_dir,
                                    get_checkpoint_filename(task_id, sub_id))
        # write; context manager closes the handle even on error
        with open(fileFullPath, 'wb') as fo:
            fo.write(file.file.read())
    except Exception as e:
        errStr = "cannot write file due to {0}".format(str(e))
        tmpLog.error(errStr)
        return json.dumps({'status': status, 'message': errStr})
    status = True
    tmpMsg = "successfully placed at {0}".format(fileFullPath)
    tmpLog.debug(tmpMsg)
    return json.dumps({'status': status, 'message': tmpMsg})
def application(environ, start_response):
    """WSGI entry point: dispatch the last path component of SCRIPT_NAME to a
    module-level function of the same name, provided it is in allowedMethods.
    Returns the method result as text/plain or application/json, with special
    handling for EC_NotFound (404) and EC_Redirect (302).
    """
    # get method name
    methodName = ''
    if 'SCRIPT_NAME' in environ:
        methodName = environ['SCRIPT_NAME'].split('/')[-1]
    tmpLog = LogWrapper(_logger, "PID={0} {1}".format(os.getpid(), methodName))
    tmpLog.debug("start")
    regStart = datetime.datetime.utcnow()
    retType = None
    # check method name against the allow-list
    if not methodName in allowedMethods:
        tmpLog.error("is forbidden")
        exeRes = "False : %s is forbidden" % methodName
    else:
        # get method object from this module's globals
        tmpMethod = None
        try:
            tmpMethod = globals()[methodName]
        except Exception:
            pass
        # object not found
        if tmpMethod is None:
            tmpLog.error("is undefined")
            exeRes = "False"
        else:
            try:
                # get params from the request body/query string
                tmpPars = cgi.FieldStorage(environ['wsgi.input'], environ=environ,
                                           keep_blank_values=1)
                # convert to map; uploads stay as FieldStorage, the rest as strings
                params = {}
                for tmpKey in list(tmpPars):
                    if tmpPars[tmpKey].file is not None and tmpPars[tmpKey].filename is not None:
                        # file
                        params[tmpKey] = tmpPars[tmpKey]
                    else:
                        # string
                        params[tmpKey] = tmpPars.getfirst(tmpKey)
                if panda_config.entryVerbose:
                    tmpLog.debug("with %s" % str(list(params)))
                # dummy request object standing in for the mod_python request
                dummyReq = DummyReq(environ, tmpLog)
                param_list = [dummyReq]
                # exec
                exeRes = tmpMethod(*param_list, **params)
                # extract return type when the method returned a typed payload
                if isinstance(exeRes, dict):
                    retType = exeRes['type']
                    exeRes = exeRes['content']
                # convert bool to string
                if exeRes in [True, False]:
                    exeRes = str(exeRes)
            except Exception as e:
                tmpLog.error("execution failure : {0}".format(str(e)))
                # dump the whole environ to help debugging
                errStr = ""
                for tmpKey in environ:
                    tmpVal = environ[tmpKey]
                    errStr += "%s : %s\n" % (tmpKey, str(tmpVal))
                tmpLog.error(errStr)
                # return internal server error
                start_response('500 INTERNAL SERVER ERROR', [('Content-Type', 'text/plain')])
                return [str(e)]
    if panda_config.entryVerbose:
        tmpLog.debug("done")
    regTime = datetime.datetime.utcnow() - regStart
    tmpLog.info("exec_time=%s.%03d sec, return len=%s B" % (regTime.seconds,
                                                            regTime.microseconds / 1000,
                                                            len(str(exeRes))))
    # return, mapping sentinel results to HTTP statuses
    if exeRes == pandaserver.taskbuffer.ErrorCode.EC_NotFound:
        start_response('404 Not Found', [('Content-Type', 'text/plain')])
        return ['not found']
    elif isinstance(exeRes, pandaserver.taskbuffer.ErrorCode.EC_Redirect):
        start_response('302 Redirect', [('Location', exeRes.url)])
        return ['redirect']
    else:
        if retType == 'json':
            start_response('200 OK', [('Content-Type', 'application/json')])
        else:
            start_response('200 OK', [('Content-Type', 'text/plain')])
        # WSGI bodies must be bytes
        if isinstance(exeRes, str):
            exeRes = exeRes.encode()
        return [exeRes]
from pandaserver.config import panda_config
from pandaserver.taskbuffer.TaskBuffer import taskBuffer
from pandacommon.pandalogger.PandaLogger import PandaLogger
from pandacommon.pandalogger.LogWrapper import LogWrapper
from pandaserver.proxycache import panda_proxy_cache

# logger
_logger = PandaLogger().getLogger('panda_activeusers_query')
tmpLog = LogWrapper(_logger)

# Script entry: set up TaskBuffer, the MyProxy interface and the role list.
# NOTE(review): this chunk appears truncated — the user loop presumably
# follows; confirm against the full file.
if __name__ == '__main__':
    tmpLog.debug("================= start ==================")
    # instantiate TB
    taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    # instantiate MyProxy I/F
    my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface()
    # roles
    if hasattr(panda_config, 'proxy_cache_roles'):
        roles = panda_config.proxy_cache_roles.split(',')
    else:
        roles = [
            'atlas',
            'atlas:/atlas/Role=production',
            'atlas:/atlas/Role=pilot'
        ]
def putFile(req, file):
    """Receive a user sandbox upload, write it into the cache directory and
    record it in the DB.

    Returns True on success, False for insecure/limited-proxy requests, or an
    'ERROR : ...' string on failure.  Non-'sources.*' uploads get a tighter
    100 MB size limit; 'sources.*' uploads may be up to 768 MB.
    """
    tmpLog = LogWrapper(_logger, 'putFile-{}'.format(datetime.datetime.utcnow().isoformat('/')))
    if not Protocol.isSecure(req):
        tmpLog.error('No SSL_CLIENT_S_DN')
        return False
    if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']:
        return False
    # user name
    username = CoreUtils.clean_user_id(req.subprocess_env['SSL_CLIENT_S_DN'])
    tmpLog.debug("start %s %s" % (username, file.filename))
    # size check
    fullSizeLimit = 768 * 1024 * 1024
    if not file.filename.startswith('sources.'):
        noBuild = True
        sizeLimit = 100 * 1024 * 1024
    else:
        noBuild = False
        sizeLimit = fullSizeLimit
    # get file size
    contentLength = 0
    try:
        # bug fix: was long(), a Python 2 builtin; its NameError was silently
        # swallowed by the except below, leaving contentLength at 0 and
        # effectively disabling the size check
        contentLength = int(req.headers_in["content-length"])
    except Exception:
        if "content-length" in req.headers_in:
            tmpLog.error("cannot get CL : %s" % req.headers_in["content-length"])
        else:
            tmpLog.error("no CL")
    tmpLog.debug("size %s" % contentLength)
    if contentLength > sizeLimit:
        errStr = "ERROR : Upload failure. Exceeded size limit %s>%s." % (contentLength, sizeLimit)
        if noBuild:
            errStr += " Please submit the job without --noBuild/--libDS since those options impose a tighter size limit"
        else:
            errStr += " Please remove redundant files from your workarea"
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    try:
        fileName = file.filename.split('/')[-1]
        fileFullPath = '%s/%s' % (panda_config.cache_dir, fileName)
        # avoid overwriting
        if os.path.exists(fileFullPath):
            # touch so the existing file is considered fresh
            os.utime(fileFullPath, None)
            # send error message
            errStr = "ERROR : Cannot overwrite file"
            tmpLog.debug('cannot overwrite file %s' % fileName)
            tmpLog.debug("end")
            return errStr
        fileContent = file.file.read()
        # optionally gzip files whose names match the configured patterns
        if hasattr(panda_config, 'compress_file_names') and \
                [True for patt in panda_config.compress_file_names.split(',') if re.search(patt, fileName) is not None]:
            fileContent = gzip.compress(fileContent)
        # write; context manager closes the handle even on error (the original
        # leaked the handle when write() raised)
        with open(fileFullPath, 'wb') as fo:
            fo.write(fileContent)
    except Exception:
        errStr = "ERROR : Cannot write file"
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    # checksum
    try:
        # decode the gzip footer: CRC32 and ISIZE are the last 8 bytes
        # NOTE(review): gzip stores these little-endian; "II" uses native
        # order, which matches on little-endian hosts — confirm if this ever
        # runs on big-endian hardware
        footer = fileContent[-8:]
        checkSum, isize = struct.unpack("II", footer)
        tmpLog.debug("CRC from gzip Footer %s" % checkSum)
    except Exception:
        # use None to avoid delay for now (on-the-fly CRC deliberately skipped)
        checkSum = None
        tmpLog.debug("CRC calculated %s" % checkSum)
    # file size
    fileSize = len(fileContent)
    tmpLog.debug("written dn=%s file=%s size=%s crc=%s" %
                 (username, fileFullPath, fileSize, checkSum))
    # put file info to DB
    if panda_config.record_sandbox_info:
        to_insert = True
        for patt in IGNORED_SUFFIX:
            if file.filename.endswith(patt):
                to_insert = False
                break
        if not to_insert:
            tmpLog.debug("skipped to insert to DB")
        else:
            statClient, outClient = Client.insertSandboxFileInfo(username, file.filename,
                                                                 fileSize, checkSum)
            if statClient != 0 or outClient.startswith("ERROR"):
                # DB insertion failure is logged but does not fail the upload
                tmpLog.error("failed to put sandbox to DB with %s %s" % (statClient, outClient))
            else:
                tmpLog.debug("inserted sandbox to DB with %s" % outClient)
    tmpLog.debug("end")
    return True
def main(argv=tuple(), tbuf=None, **kwargs):
    """Adder daemon main: fetch batches of job output reports and process them
    with AdderGen in a pool of worker processes.

    argv[1], when given, is the grace period (minutes) for report selection.
    tbuf, when given, is a pre-initialized TaskBuffer.  Returns True so the
    daemon loop runs main again.
    """
    # Python 2/3 compatibility shim for the removed long builtin
    try:
        long
    except NameError:
        long = int
    prelock_pid = GenericThread().get_pid()
    tmpLog = LogWrapper(_logger, "<pid={}>".format(prelock_pid))
    tmpLog.debug("===================== start =====================")
    # return value, true to run main again in next daemon loop
    ret_val = True
    # grace period
    try:
        gracePeriod = int(argv[1])
    except Exception:
        gracePeriod = 1
    # lock interval in minutes
    lock_interval = 10
    # retry interval in minutes
    retry_interval = 3
    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        taskBuffer = tbuf
    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)

    # thread for adder: consumes job output reports from a shared weighted list
    class AdderThread(GenericThread):
        def __init__(self, taskBuffer, aSiteMapper, job_output_reports):
            GenericThread.__init__(self)
            self.taskBuffer = taskBuffer
            self.aSiteMapper = aSiteMapper
            self.job_output_reports = job_output_reports

        # main loop
        def run(self):
            # initialize
            taskBuffer = self.taskBuffer
            aSiteMapper = self.aSiteMapper
            # get file list
            timeNow = datetime.datetime.utcnow()
            timeInt = datetime.datetime.utcnow()
            # unique pid (re-init after fork so the pid reflects this process)
            GenericThread.__init__(self)
            uniq_pid = self.get_pid()
            # log pid
            tmpLog.debug("pid={0} : run".format(uniq_pid))
            # stats
            n_processed = 0
            # loop until the shared report list is drained
            while True:
                # get report
                one_jor = self.job_output_reports.pop()
                if not one_jor:
                    break
                # lock the report so no other worker processes it
                panda_id, job_status, attempt_nr, time_stamp = one_jor
                got_lock = taskBuffer.lockJobOutputReport(
                    panda_id=panda_id, attempt_nr=attempt_nr,
                    pid=uniq_pid, time_limit=lock_interval)
                if not got_lock:
                    continue
                # add
                try:
                    modTime = time_stamp
                    if (timeNow - modTime) > datetime.timedelta(hours=24):
                        # last add: reports older than a day get a final attempt
                        tmpLog.debug(
                            "pid={0} : last add job={1}.{2} st={3}".format(
                                uniq_pid, panda_id, attempt_nr, job_status))
                        ignoreTmpError = False
                    else:
                        # usual add: temporary errors are tolerated and retried
                        tmpLog.debug("pid={0} : add job={1}.{2} st={3}".format(
                            uniq_pid, panda_id, attempt_nr, job_status))
                        ignoreTmpError = True
                    # get adder
                    adder_gen = AdderGen(taskBuffer, panda_id, job_status, attempt_nr,
                                         ignoreTmpError=ignoreTmpError,
                                         siteMapper=aSiteMapper, pid=uniq_pid,
                                         prelock_pid=uniq_pid,
                                         lock_offset=lock_interval - retry_interval)
                    n_processed += 1
                    # execute
                    adder_gen.run()
                    del adder_gen
                except Exception as e:
                    tmpLog.error("pid={} : failed to run with {} {}".format(
                        uniq_pid, str(e), traceback.format_exc()))
            # stats
            tmpLog.debug("pid={} : processed {}".format(uniq_pid, n_processed))

        # launcher, run with multiprocessing
        def proc_launch(self):
            # run
            self.process = multiprocessing.Process(target=self.run)
            self.process.start()

        # join of multiprocessing
        def proc_join(self):
            self.process.join()

    # TaskBuffer with more connections behind TaskBufferInterface
    tmpLog.debug("setup taskBufferIF")
    n_connections = 4
    _tbuf = TaskBuffer()
    _tbuf.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=n_connections)
    taskBufferIF = TaskBufferInterface()
    taskBufferIF.launch(_tbuf)
    # add files
    tmpLog.debug("run Adder")
    interval = 10
    nLoop = 10
    # NOTE(review): the loop bound hard-codes 10 instead of using nLoop
    for iLoop in range(10):
        tmpLog.debug('start iLoop={}/{}'.format(iLoop, nLoop))
        start_time = datetime.datetime.utcnow()
        adderThrList = []
        nThr = 10
        n_jors_per_batch = 1000
        jor_lists = WeightedLists(multiprocessing.Lock())
        # get some job output reports: non-user labels weighted 3, user 7
        jor_list_others = taskBuffer.listJobOutputReport(
            only_unlocked=True, time_limit=lock_interval,
            limit=n_jors_per_batch * nThr, grace_period=gracePeriod,
            anti_labels=['user'])
        jor_lists.add(3, jor_list_others)
        jor_list_user = taskBuffer.listJobOutputReport(
            only_unlocked=True, time_limit=lock_interval,
            limit=n_jors_per_batch * nThr, grace_period=gracePeriod,
            labels=['user'])
        jor_lists.add(7, jor_list_user)
        # adder consumer processes (dedicated-TaskBuffer path currently disabled)
        _n_thr_with_tbuf = 0
        tbuf_list = []
        tmpLog.debug("got {} job reports".format(len(jor_lists)))
        for i in range(nThr):
            if i < _n_thr_with_tbuf:
                tbuf = TaskBuffer()
                tbuf_list.append(tbuf)
                tbuf.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
                thr = AdderThread(tbuf, aSiteMapper, jor_lists)
            else:
                thr = AdderThread(taskBufferIF.getInterface(), aSiteMapper, jor_lists)
            adderThrList.append(thr)
        # start all threads
        for thr in adderThrList:
            # thr.start()
            thr.proc_launch()
            time.sleep(0.25)
        # join all threads
        for thr in adderThrList:
            # thr.join()
            thr.proc_join()
        [tbuf.cleanup() for tbuf in tbuf_list]
        end_time = datetime.datetime.utcnow()
        sleep_time = interval - (end_time - start_time).seconds
        if sleep_time > 0 and iLoop + 1 < nLoop:
            sleep_time = random.randint(1, sleep_time)
            tmpLog.debug("sleep {} sec".format(sleep_time))
            time.sleep(sleep_time)
    # stop TaskBuffer IF
    taskBufferIF.stop()
    tmpLog.debug("===================== end =====================")
    # return
    return ret_val
def main(argv=tuple(), tbuf=None, **kwargs):
    """Housekeeping daemon main: purge stale jobsDefined rows, aggregate pilot
    getJob/updateJob counts from the dispatcher log, update nRunning, finish or
    kill co-jumbo jobs, and fork Setupper processes for pending set.* files.
    """
    # Python 2/3 compatibility shim for the removed long builtin
    try:
        long
    except NameError:
        long = int
    tmpLog = LogWrapper(_logger, None)
    tmpLog.debug("===================== start =====================")
    # current minute (used below to shard the nRunning update across hosts)
    currentMinute = datetime.datetime.utcnow().minute
    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        taskBuffer = tbuf
    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)
    # delete stale rows from jobsDefined4
    tmpLog.debug("Del session")
    status, retSel = taskBuffer.querySQLS(
        "SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {})
    if retSel is not None:
        try:
            maxID = retSel[0][0]
            tmpLog.debug("maxID : %s" % maxID)
            if maxID is not None:
                varMap = {}
                varMap[':maxID'] = maxID
                varMap[':jobStatus1'] = 'activated'
                varMap[':jobStatus2'] = 'waiting'
                varMap[':jobStatus3'] = 'failed'
                varMap[':jobStatus4'] = 'cancelled'
                status, retDel = taskBuffer.querySQLS(
                    "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)", varMap)
        except Exception:
            pass
    # count # of getJob/updateJob in dispatcher's log
    try:
        # don't update when logrotate is running (within 5 min of 03:02 UTC)
        timeNow = datetime.datetime.utcnow()
        logRotateTime = timeNow.replace(hour=3, minute=2, second=0, microsecond=0)
        if (timeNow > logRotateTime and (timeNow - logRotateTime) < datetime.timedelta(minutes=5)) or \
                (logRotateTime > timeNow and (logRotateTime - timeNow) < datetime.timedelta(minutes=5)):
            tmpLog.debug("skip pilotCounts session for logrotate")
        else:
            # log filename
            dispLogName = '%s/panda-PilotRequests.log' % panda_config.logdir
            # time limit: 3h window overall, 1h window for the "short" counts
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
            timeLimitS = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
            # check if tgz is required: when the live log already starts before
            # the time limit, the rotated .gz is not needed
            com = 'head -1 %s' % dispLogName
            lostat, loout = commands_get_status_output(com)
            useLogTgz = True
            if lostat == 0:
                match = re.search('^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', loout)
                if match is not None:
                    startTime = datetime.datetime(*time.strptime(
                        match.group(0), '%Y-%m-%d %H:%M:%S')[:6])
                    # current log contains all info
                    if startTime < timeLimit:
                        useLogTgz = False
            # log files
            dispLogNameList = [dispLogName]
            if useLogTgz:
                today = datetime.date.today()
                dispLogNameList.append('{0}-{1}.gz'.format(
                    dispLogName, today.strftime('%Y%m%d')))
            # delete tmp
            commands_get_status_output('rm -f %s.tmp-*' % dispLogName)
            # tmp name
            tmpLogName = '%s.tmp-%s' % (dispLogName, datetime.datetime.utcnow(
            ).strftime('%Y-%m-%d-%H-%M-%S'))
            # loop over all files, accumulating per-site/method/node counts
            pilotCounts = {}
            pilotCountsS = {}
            for tmpDispLogName in dispLogNameList:
                # expand or copy into the tmp file
                if tmpDispLogName.endswith('.gz'):
                    com = 'gunzip -c %s > %s' % (tmpDispLogName, tmpLogName)
                else:
                    com = 'cp %s %s' % (tmpDispLogName, tmpLogName)
                lostat, loout = commands_get_status_output(com)
                if lostat != 0:
                    errMsg = 'failed to expand/copy %s with : %s' % (
                        tmpDispLogName, loout)
                    raise RuntimeError(errMsg)
                # search string: timestamp plus method/site/node/type fields
                sStr = '^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*'
                sStr += 'method=(.+),site=(.+),node=(.+),type=(.+)'
                # read
                logFH = open(tmpLogName)
                for line in logFH:
                    # check format
                    match = re.search(sStr, line)
                    if match is not None:
                        # check timerange
                        timeStamp = datetime.datetime(*time.strptime(
                            match.group(1), '%Y-%m-%d %H:%M:%S')[:6])
                        if timeStamp < timeLimit:
                            continue
                        tmpMethod = match.group(2)
                        tmpSite = match.group(3)
                        tmpNode = match.group(4)
                        tmpType = match.group(5)
                        # protection against corrupted entries from pilot,
                        # e.g. pilot reading site json from cvmfs while it was being updated
                        if tmpSite not in aSiteMapper.siteSpecList:
                            continue
                        # sum
                        pilotCounts.setdefault(tmpSite, {})
                        pilotCounts[tmpSite].setdefault(tmpMethod, {})
                        pilotCounts[tmpSite][tmpMethod].setdefault(tmpNode, 0)
                        pilotCounts[tmpSite][tmpMethod][tmpNode] += 1
                        # short (last-hour) counts
                        if timeStamp > timeLimitS:
                            if tmpSite not in pilotCountsS:
                                pilotCountsS[tmpSite] = dict()
                            if tmpMethod not in pilotCountsS[tmpSite]:
                                pilotCountsS[tmpSite][tmpMethod] = dict()
                            if tmpNode not in pilotCountsS[tmpSite][tmpMethod]:
                                pilotCountsS[tmpSite][tmpMethod][tmpNode] = 0
                            pilotCountsS[tmpSite][tmpMethod][tmpNode] += 1
                # close
                logFH.close()
            # delete tmp
            commands_get_status_output('rm %s' % tmpLogName)
            # update site data with both aggregation windows
            hostID = panda_config.pserverhost.split('.')[0]
            tmpLog.debug("pilotCounts session")
            retPC = taskBuffer.updateSiteData(hostID, pilotCounts, interval=3)
            tmpLog.debug(retPC)
            retPC = taskBuffer.updateSiteData(hostID, pilotCountsS, interval=1)
            tmpLog.debug(retPC)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("updateJob/getJob : %s %s" % (errType, errValue))
    # nRunning: only one host in the rotation does the insert each interval
    tmpLog.debug("nRunning session")
    try:
        if (currentMinute / panda_config.nrun_interval
                ) % panda_config.nrun_hosts == panda_config.nrun_snum:
            retNR = taskBuffer.insertnRunningInSiteData()
            tmpLog.debug(retNR)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("nRunning : %s %s" % (errType, errValue))
    # session for co-jumbo jobs
    tmpLog.debug("co-jumbo session")
    try:
        ret = taskBuffer.getCoJumboJobsToBeFinished(30, 0, 1000)
        if ret is None:
            tmpLog.debug("failed to get co-jumbo jobs to finish")
        else:
            coJumboA, coJumboD, coJumboW, coJumboTokill = ret
            tmpLog.debug("finish {0} co-jumbo jobs in Active".format(
                len(coJumboA)))
            if len(coJumboA) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboA,
                                               fromDefined=False,
                                               fromActive=True,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    # jobs whose input files are inconsistent in JEDI get closed
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], False)
            tmpLog.debug("finish {0} co-jumbo jobs in Defined".format(
                len(coJumboD)))
            if len(coJumboD) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboD,
                                               fromDefined=True,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], True)
            tmpLog.debug("finish {0} co-jumbo jobs in Waiting".format(
                len(coJumboW)))
            if len(coJumboW) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboW,
                                               fromDefined=False,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=True)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], False, True)
            tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format(
                len(coJumboTokill)))
            if len(coJumboTokill) > 0:
                # kill in chunks of 100 PandaIDs
                jediJobs = list(coJumboTokill)
                nJob = 100
                iJob = 0
                while iJob < len(jediJobs):
                    tmpLog.debug(' killing %s' % str(jediJobs[iJob:iJob + nJob]))
                    Client.killJobs(jediJobs[iJob:iJob + nJob], 51,
                                    keepUnmerged=True)
                    iJob += nJob
    except Exception:
        errStr = traceback.format_exc()
        tmpLog.error(errStr)
    tmpLog.debug("Fork session")

    # thread for fork: runs forkSetupper.py on one set.* file in a subshell
    class ForkThr(threading.Thread):
        def __init__(self, fileName):
            threading.Thread.__init__(self)
            self.fileName = fileName

        def run(self):
            if 'VIRTUAL_ENV' in os.environ:
                prefix = os.environ['VIRTUAL_ENV']
            else:
                prefix = ''
            setupStr = 'source {0}/etc/sysconfig/panda_server; '.format(prefix)
            runStr = '%s/python -Wignore ' % panda_config.native_python
            runStr += panda_config.pandaPython_dir + '/dataservice/forkSetupper.py -i '
            runStr += self.fileName
            if self.fileName.split('/')[-1].startswith('set.NULL.'):
                runStr += ' -t'
            comStr = setupStr + runStr
            tmpLog.debug(comStr)
            commands_get_status_output(comStr)

    # get set.* files
    filePatt = panda_config.logdir + '/' + 'set.*'
    fileList = glob.glob(filePatt)
    # the max number of threads
    maxThr = 10
    nThr = 0
    # loop over all files
    forkThrList = []
    timeNow = datetime.datetime.utcnow()
    for tmpName in fileList:
        if not os.path.exists(tmpName):
            continue
        try:
            # takes care of only recent files (1 min to 1 h old)
            modTime = datetime.datetime(
                *(time.gmtime(os.path.getmtime(tmpName))[:7]))
            if (timeNow - modTime) > datetime.timedelta(minutes=1) and \
                    (timeNow - modTime) < datetime.timedelta(hours=1):
                cSt, cOut = commands_get_status_output(
                    'ps aux | grep fork | grep -v PYTH')
                # if no process is running for the file
                if cSt == 0 and tmpName not in cOut:
                    nThr += 1
                    thr = ForkThr(tmpName)
                    thr.start()
                    forkThrList.append(thr)
                    if nThr > maxThr:
                        break
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            tmpLog.error("%s %s" % (errType, errValue))
    # join fork threads
    for thr in forkThrList:
        thr.join()
    # terminate TaskBuffer IF
    # taskBufferIF.terminate()
    tmpLog.debug("===================== end =====================")
import datetime

from pandaserver.taskbuffer.TaskBuffer import taskBuffer
from pandacommon.pandalogger.PandaLogger import PandaLogger
from pandacommon.pandalogger.LogWrapper import LogWrapper
from pandaserver.brokerage.SiteMapper import SiteMapper
from pandaserver.taskbuffer import ErrorCode

# password
from pandaserver.config import panda_config

# logger
_logger = PandaLogger().getLogger('esPreemption')
tmpLog = LogWrapper(_logger)

# Script preamble: set up TaskBuffer/SiteMapper and build the selection query
# for low-priority Event Service jobs.
# NOTE(review): this chunk appears truncated — the query execution presumably
# follows; confirm against the full file.
tmpLog.debug("================= start ==================")
# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)
# time limit
timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=15)
# get low priority ES jobs per site
sqlEsJobs = "SELECT PandaID,computingSite,commandToPilot,startTime "
sqlEsJobs += "FROM {0}.jobsActive4 ".format(panda_config.schemaPANDA)
sqlEsJobs += "WHERE prodSourceLabel IN (:label1,:label2) AND eventService=:es "
sqlEsJobs += "AND currentPriority<:prio AND jobStatus=:jobStat "
sqlEsJobs += "ORDER BY currentPriority,PandaID "
def uploadLog(req, file):
    """Upload a JEDI task log into cache_dir/jedilog and return its HTTP URL.

    Returns False for insecure/limited-proxy requests, or an error string on
    failure.  Uploads above 100 MB are rejected.
    """
    if not Protocol.isSecure(req):
        return False
    if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']:
        return False
    tmpLog = LogWrapper(_logger, 'uploadLog <{0}>'.format(file.filename))
    tmpLog.debug("start {0}".format(req.subprocess_env['SSL_CLIENT_S_DN']))
    # size check
    sizeLimit = 100 * 1024 * 1024
    # get file size
    contentLength = 0
    try:
        # bug fix: was long(), a Python 2 builtin; its NameError was silently
        # swallowed below, leaving contentLength at 0 and bypassing the size check
        contentLength = int(req.headers_in["content-length"])
    except Exception:
        if "content-length" in req.headers_in:
            tmpLog.error("cannot get CL : %s" % req.headers_in["content-length"])
        else:
            tmpLog.error("no CL")
    tmpLog.debug("size %s" % contentLength)
    if contentLength > sizeLimit:
        errStr = "failed to upload log due to size limit"
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    jediLogDir = '/jedilog'
    retStr = ''
    try:
        fileBaseName = file.filename.split('/')[-1]
        fileFullPath = '{0}{1}/{2}'.format(panda_config.cache_dir,
                                           jediLogDir, fileBaseName)
        # delete old file
        if os.path.exists(fileFullPath):
            os.remove(fileFullPath)
        # write; context manager closes the handle even on error (the original
        # leaked the handle when write() raised)
        with open(fileFullPath, 'wb') as fo:
            fo.write(file.file.read())
        tmpLog.debug("written to {0}".format(fileFullPath))
        retStr = 'http://{0}/cache{1}/{2}'.format(getServerHTTP(None),
                                                  jediLogDir, fileBaseName)
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        errStr = "failed to write log with {0}:{1}".format(
            errtype.__name__, errvalue)
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    tmpLog.debug("end")
    return retStr
class AdderGen(object):
    """Post-processing agent for a finished/failed job.

    Reads the pilot's output report (XML or JSON), registers output files
    via a VO-specific adder plugin, applies retry rules on failure, updates
    the job record in the DB, and triggers dataset closing (Closer).
    """

    # constructor
    def __init__(self, taskBuffer, jobID, jobStatus, attemptNr,
                 ignoreTmpError=True, siteMapper=None, pid=None,
                 prelock_pid=None, lock_offset=10):
        """Keep references and initialize per-job bookkeeping.

        :param taskBuffer: DB access layer
        :param jobID: PandaID of the job to process
        :param jobStatus: status reported by the pilot
        :param attemptNr: attempt number of the output report
        :param ignoreTmpError: if True, bail out quietly on temporary DDM errors
        :param siteMapper: site configuration mapper (may be None)
        :param pid: process identifier used for output-report locking
        :param prelock_pid: pid that pre-locked the report, if any
        :param lock_offset: lock timeout offset for unlockJobOutputReport
        """
        self.job = None          # JobSpec, fetched lazily from taskBuffer
        self.jobID = jobID
        self.jobStatus = jobStatus
        self.taskBuffer = taskBuffer
        self.ignoreTmpError = ignoreTmpError
        self.lock_offset = lock_offset
        self.siteMapper = siteMapper
        self.datasetMap = {}
        # per-LFN extras collected from the pilot report, passed to DB update
        self.extraInfo = {'surl': {}, 'nevents': {}, 'lbnr': {},
                          'endpoint': {}, 'guid': {}}
        self.attemptNr = attemptNr
        self.pid = pid
        self.prelock_pid = prelock_pid
        self.data = None         # raw output-report payload (XML or JSON text)
        # logger
        self.logger = LogWrapper(_logger, str(self.jobID))

    # dump file report
    def dumpFileReport(self, fileCatalog, attemptNr):
        """Store the pilot's file catalog in the job output report table."""
        self.logger.debug("dump file report")
        # dump Catalog into file
        # if attemptNr is None:
        #     xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,self.jobID,self.jobStatus,
        #                                str(uuid.uuid4()))
        # else:
        #     xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir,self.jobID,self.jobStatus,
        #                                   str(uuid.uuid4()),attemptNr)
        # file = open(xmlFile,'w')
        # file.write(fileCatalog)
        # file.close()
        # dump Catalog into job output report table
        attempt_nr = 0 if attemptNr is None else attemptNr
        if self.job is None:
            self.job = self.taskBuffer.peekJobs([self.jobID],
                                                fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
        if self.job:
            self.taskBuffer.insertJobOutputReport(
                panda_id=self.jobID,
                prod_source_label=self.job.prodSourceLabel,
                job_status=self.jobStatus,
                attempt_nr=attempt_nr,
                data=fileCatalog)

    # get plugin class
    def getPluginClass(self, tmpVO, tmpGroup):
        """Return the adder plugin class for (VO, group), ATLAS by default."""
        # instantiate concrete plugin
        adderPluginClass = panda_config.getPlugin('adder_plugins', tmpVO,
                                                  tmpGroup)
        if adderPluginClass is None:
            # use ATLAS plugin by default
            from pandaserver.dataservice.AdderAtlasPlugin import AdderAtlasPlugin
            adderPluginClass = AdderAtlasPlugin
        self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
        return adderPluginClass

    # main
    def run(self):
        """Process one job output report end-to-end.

        Fetches the report and the job, validates state, runs the adder
        plugin, applies retry rules on failure, updates the DB and kicks
        off Closer for the output datasets. Any unexpected exception
        unlocks the output report so another process can retry.
        """
        try:
            self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus,
                                                              self.attemptNr))
            # got lock, get the report
            report_dict = self.taskBuffer.getJobOutputReport(
                panda_id=self.jobID, attempt_nr=self.attemptNr)
            self.data = report_dict.get('data')
            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID],
                                                fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
            # check if job has finished
            if self.job is None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in ['finished', 'failed', 'unknown',
                                        'merging']:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr is not None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' %
                                  (self.job.attemptNr, self.attemptNr))
            # elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
            #     errMsg = 'XML with attemptNr for {0}'.format(self.job.jobStatus)
            #     self.logger.error(errMsg)
            elif self.jobStatus == EventServiceUtils.esRegStatus:
                # instantiate concrete plugin
                adderPluginClass = self.getPluginClass(self.job.VO,
                                                       self.job.cloud)
                adderPlugin = adderPluginClass(self.job,
                                               taskBuffer=self.taskBuffer,
                                               siteMapper=self.siteMapper,
                                               logger=self.logger)
                # execute
                self.logger.debug('plugin is ready for ES file registration')
                adderPlugin.registerEventServiceFiles()
            else:
                # check file status in JEDI
                if not self.job.isCancelled() and self.job.taskBufferErrorCode not in \
                        [pandaserver.taskbuffer.ErrorCode.EC_PilotRetried]:
                    fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(
                        self.job)
                    self.logger.debug("check file status in JEDI : {0}".format(
                        fileCheckInJEDI))
                    if fileCheckInJEDI is None:
                        raise RuntimeError(
                            'failed to check file status in JEDI')
                    if fileCheckInJEDI is False:
                        # set job status to failed since some file status is wrong in JEDI
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                        errStr = "inconsistent file status between Panda and JEDI. "
                        errStr += "failed to avoid duplicated processing caused by synchronization failure"
                        self.job.ddmErrorDiag = errStr
                        self.logger.debug(
                            "set jobStatus={0} since input is inconsistent between Panda and JEDI"
                            .format(self.jobStatus))
                    elif self.job.jobSubStatus in ['pilot_closed']:
                        # terminated by the pilot
                        self.logger.debug(
                            "going to closed since terminated by the pilot")
                        retClosed = self.taskBuffer.killJobs([self.jobID],
                                                             'pilot', '60',
                                                             True)
                        if retClosed[0] is True:
                            self.logger.debug("end")
                            # remove Catalog
                            self.taskBuffer.deleteJobOutputReport(
                                panda_id=self.jobID,
                                attempt_nr=self.attemptNr)
                            return
                    # check for cloned jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        checkJC = self.taskBuffer.checkClonedJob(self.job)
                        if checkJC is None:
                            raise RuntimeError(
                                'failed to check the cloned job')
                        # failed to lock semaphore
                        if checkJC['lock'] is False:
                            self.jobStatus = 'failed'
                            self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                            self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                            self.logger.debug(
                                "set jobStatus={0} since did not get semaphore for job cloning"
                                .format(self.jobStatus))
                # use failed for cancelled/closed jobs
                if self.job.isCancelled():
                    self.jobStatus = 'failed'
                    # reset error codes to skip retrial module
                    self.job.pilotErrorCode = 0
                    self.job.exeErrorCode = 0
                    self.job.ddmErrorCode = 0
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if self.job.jobStatus not in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # instantiate concrete plugin
                        adderPluginClass = self.getPluginClass(
                            self.job.VO, self.job.cloud)
                        adderPlugin = adderPluginClass(
                            self.job,
                            taskBuffer=self.taskBuffer,
                            siteMapper=self.siteMapper,
                            extraInfo=self.extraInfo,
                            logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' %
                                          (addResult.statusCode))
                    except Exception:
                        errtype, errvalue = sys.exc_info()[:2]
                        self.logger.error(
                            "failed to execute AdderPlugin for VO={0} with {1}:{2}"
                            .format(self.job.VO, errtype, errvalue))
                        self.logger.error(
                            "failed to execute AdderPlugin for VO={0} with {1}"
                            .format(self.job.VO, traceback.format_exc()))
                        addResult = None
                        self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"
                    # ignore temporary errors
                    if self.ignoreTmpError and addResult is not None and addResult.isTemporary(
                    ):
                        self.logger.debug(': ignore %s ' %
                                          self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock job output report
                        self.taskBuffer.unlockJobOutputReport(
                            panda_id=self.jobID,
                            attempt_nr=self.attemptNr,
                            pid=self.pid,
                            lock_offset=self.lock_offset)
                        return
                    # failed
                    if addResult is None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                self.logger.debug(
                    "status after plugin call :job.jobStatus=%s jobStatus=%s" %
                    (self.job.jobStatus, self.jobStatus))
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    # First of all: check if job failed and in this case take first actions according to error table
                    source, error_code, error_diag = None, None, None
                    errors = []
                    if self.job.pilotErrorCode:
                        source = 'pilotErrorCode'
                        error_code = self.job.pilotErrorCode
                        error_diag = self.job.pilotErrorDiag
                        errors.append({
                            'source': source,
                            'error_code': error_code,
                            'error_diag': error_diag
                        })
                    if self.job.exeErrorCode:
                        source = 'exeErrorCode'
                        error_code = self.job.exeErrorCode
                        error_diag = self.job.exeErrorDiag
                        errors.append({
                            'source': source,
                            'error_code': error_code,
                            'error_diag': error_diag
                        })
                    if self.job.ddmErrorCode:
                        source = 'ddmErrorCode'
                        error_code = self.job.ddmErrorCode
                        error_diag = self.job.ddmErrorDiag
                        errors.append({
                            'source': source,
                            'error_code': error_code,
                            'error_diag': error_diag
                        })
                    if self.job.transExitCode:
                        source = 'transExitCode'
                        error_code = self.job.transExitCode
                        error_diag = ''
                        errors.append({
                            'source': source,
                            'error_code': error_code,
                            'error_diag': error_diag
                        })
                    # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag))
                    if source and error_code:
                        try:
                            self.logger.debug(
                                "AdderGen.run will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, self.job.PandaID, errors,
                                self.job.attemptNr)
                            self.logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            self.logger.error(
                                "apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                % (e, traceback.format_exc()))
                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output', 'log']:
                            if addResult is not None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult is not None and addResult.mergingFiles != []:
                        # set status for merging:
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    elif addResult is not None and addResult.transferringFiles != []:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        self.job.jobSubStatus = None
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime == 'NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                                     time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except Exception:
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                if oldJobStatus in ['cancelled', 'closed']:
                    pass
                else:
                    self.logger.debug("updating DB")
                    retU = self.taskBuffer.updateJobs(
                        [self.job],
                        False,
                        oldJobStatusList=[oldJobStatus],
                        extraInfo=self.extraInfo)
                    self.logger.debug("retU: %s" % retU)
                    # failed
                    if not retU[0]:
                        self.logger.error(
                            'failed to update DB for pandaid={0}'.format(
                                self.job.PandaID))
                        # unlock job output report
                        self.taskBuffer.unlockJobOutputReport(
                            panda_id=self.jobID,
                            attempt_nr=self.attemptNr,
                            pid=self.pid,
                            lock_offset=self.lock_offset)
                        return
                    try:
                        # updateJobs was successful and it failed a job with taskBufferErrorCode
                        self.logger.debug("AdderGen.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs(
                            [self.job.PandaID],
                            fromDefined=False,
                            fromArchived=True,
                            fromWaiting=False)[0]
                        self.logger.debug(
                            "status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}"
                            .format(job_tmp.jobStatus,
                                    job_tmp.taskBufferErrorCode,
                                    job_tmp.taskBufferErrorDiag))
                        if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            errors = [{
                                'source': source,
                                'error_code': error_code,
                                'error_diag': error_diag
                            }]
                            self.logger.debug(
                                "AdderGen.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, job_tmp.PandaID, errors,
                                job_tmp.attemptNr)
                            self.logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        self.logger.error(
                            "apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                            % (e, traceback.format_exc()))
                # setup for closer
                if not (EventServiceUtils.isEventServiceJob(self.job)
                        and self.job.isCancelled()):
                    destDBList = []
                    guidList = []
                    for file in self.job.Files:
                        # ignore inputs
                        if file.type == 'input':
                            continue
                        # skip pseudo datasets
                        if file.destinationDBlock in ['', None, 'NULL']:
                            continue
                        # start closer for output/log datasets
                        if file.destinationDBlock not in destDBList:
                            destDBList.append(file.destinationDBlock)
                        # collect GUIDs
                        if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['rucio_test'] + JobUtils.list_ptest_prod_sources and \
                            self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                            and file.type == 'output':
                            # extract base LFN since LFN was changed to full LFN for CMS
                            baseLFN = file.lfn.split('/')[-1]
                            guidList.append({
                                'lfn': baseLFN,
                                'guid': file.GUID,
                                'type': file.type,
                                'checksum': file.checksum,
                                'md5sum': file.md5sum,
                                'fsize': file.fsize,
                                'scope': file.scope
                            })
                    if guidList != []:
                        retG = self.taskBuffer.setGUIDs(guidList)
                    if destDBList != []:
                        # start Closer
                        if adderPlugin is not None and hasattr(
                                adderPlugin, 'datasetMap'
                        ) and adderPlugin.datasetMap != {}:
                            cThr = Closer.Closer(
                                self.taskBuffer,
                                destDBList,
                                self.job,
                                datasetMap=adderPlugin.datasetMap)
                        else:
                            cThr = Closer.Closer(self.taskBuffer, destDBList,
                                                 self.job)
                        self.logger.debug("start Closer")
                        # cThr.start()
                        # cThr.join()
                        cThr.run()
                        del cThr
                        self.logger.debug("end Closer")
                        # run closer for assocaiate parallel jobs
                        if EventServiceUtils.isJobCloningJob(self.job):
                            assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(
                                self.job.jediTaskID, self.job.PandaID,
                                destDBList)
                            for assJobID in assDBlockMap:
                                assDBlocks = assDBlockMap[assJobID]
                                assJob = self.taskBuffer.peekJobs(
                                    [assJobID],
                                    fromDefined=False,
                                    fromArchived=False,
                                    fromWaiting=False,
                                    forAnal=True)[0]
                                # NOTE(review): this tests self.job rather than
                                # assJob — looks like it should be `assJob is None`;
                                # confirm before changing
                                if self.job is None:
                                    self.logger.debug(
                                        ': associated job PandaID={0} not found in DB'
                                        .format(assJobID))
                                else:
                                    cThr = Closer.Closer(
                                        self.taskBuffer, assDBlocks, assJob)
                                    self.logger.debug(
                                        "start Closer for PandaID={0}".format(
                                            assJobID))
                                    # cThr.start()
                                    # cThr.join()
                                    cThr.run()
                                    del cThr
                                    self.logger.debug(
                                        "end Closer for PandaID={0}".format(
                                            assJobID))
            self.logger.debug("end")
            # try:
            #     # remove Catalog
            #     os.remove(self.xmlFile)
            # except Exception:
            #     pass
            # remove Catalog
            self.taskBuffer.deleteJobOutputReport(panda_id=self.jobID,
                                                  attempt_nr=self.attemptNr)
            del self.data
            del report_dict
        except Exception as e:
            errStr = ": {} {}".format(str(e), traceback.format_exc())
            self.logger.error(errStr)
            self.logger.error("except")
            # unlock job output report
            self.taskBuffer.unlockJobOutputReport(panda_id=self.jobID,
                                                  attempt_nr=self.attemptNr,
                                                  pid=self.pid,
                                                  lock_offset=self.lock_offset)

    # parse XML
    # 0: succeeded, 1: harmless error to exit, 2: fatal error, 3: event service
    def parseXML(self):
        """Parse the pilot output report (XML, falling back to JSON).

        Fills lfns/guids/checksums and self.extraInfo from the report, then
        reconciles them against the job's file table, marking missing or
        inconsistent files as failed.

        :return: 0 on success, 1 for a harmless error, 2 for a fatal error
        """
        # get LFN and GUID
        # self.logger.debug('XML filename : %s' % self.xmlFile)
        # no outputs
        log_out = [f for f in self.job.Files if f.type in ['log', 'output']]
        if not log_out:
            self.logger.debug("has no outputs")
            self.logger.debug("parseXML end")
            return 0
        # get input files
        inputLFNs = []
        for file in self.job.Files:
            if file.type == 'input':
                inputLFNs.append(file.lfn)
        # parse XML
        lfns = []
        guids = []
        fsizes = []
        md5sums = []
        chksums = []
        surls = []
        fullLfnMap = {}
        nEventsMap = {}
        guidMap = dict()
        try:
            # root = xml.dom.minidom.parse(self.xmlFile)
            root = xml.dom.minidom.parseString(self.data)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical = file.getElementsByTagName('logical')[0]
                lfnNode = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                # get metadata
                fsize = None
                md5sum = None
                adler32 = None
                surl = None
                fullLFN = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'fsize':
                        # NOTE(review): long() is py2-only; on py3 this raises
                        # NameError, which drops the whole parse into the
                        # JSON/error fallback below — confirm runtime version
                        fsize = long(meta.getAttribute('att_value'))
                    elif name == 'md5sum':
                        md5sum = str(meta.getAttribute('att_value'))
                        # check
                        if re.search("^[a-fA-F0-9]{32}$", md5sum) is None:
                            md5sum = None
                    elif name == 'adler32':
                        adler32 = str(meta.getAttribute('att_value'))
                    elif name == 'surl':
                        surl = str(meta.getAttribute('att_value'))
                    elif name == 'full_lfn':
                        fullLFN = str(meta.getAttribute('att_value'))
                # endpoints
                self.extraInfo['endpoint'][lfn] = []
                for epNode in file.getElementsByTagName('endpoint'):
                    self.extraInfo['endpoint'][lfn].append(
                        str(epNode.firstChild.data))
                # error check
                if (lfn not in inputLFNs) and (fsize is None or
                                               (md5sum is None
                                                and adler32 is None)):
                    if EventServiceUtils.isEventServiceMerge(self.job):
                        continue
                    else:
                        raise RuntimeError('fsize/md5sum/adler32/surl=None')
                # append
                lfns.append(lfn)
                guids.append(guid)
                fsizes.append(fsize)
                md5sums.append(md5sum)
                surls.append(surl)
                if adler32 is not None:
                    # use adler32 if available
                    chksums.append("ad:%s" % adler32)
                else:
                    chksums.append("md5:%s" % md5sum)
                if fullLFN is not None:
                    fullLfnMap[lfn] = fullLFN
        except Exception:
            # parse json
            try:
                import json
                # with open(self.xmlFile) as tmpF:
                jsonDict = json.loads(self.data)
                for lfn in jsonDict:
                    fileData = jsonDict[lfn]
                    lfn = str(lfn)
                    fsize = None
                    md5sum = None
                    adler32 = None
                    surl = None
                    fullLFN = None
                    guid = str(fileData['guid'])
                    if 'fsize' in fileData:
                        fsize = long(fileData['fsize'])
                    if 'md5sum' in fileData:
                        md5sum = str(fileData['md5sum'])
                        # check
                        if re.search("^[a-fA-F0-9]{32}$", md5sum) is None:
                            md5sum = None
                    if 'adler32' in fileData:
                        adler32 = str(fileData['adler32'])
                    if 'surl' in fileData:
                        surl = str(fileData['surl'])
                    if 'full_lfn' in fileData:
                        fullLFN = str(fileData['full_lfn'])
                    # endpoints
                    self.extraInfo['endpoint'][lfn] = []
                    if 'endpoint' in fileData:
                        self.extraInfo['endpoint'][lfn] = fileData['endpoint']
                    # error check
                    if (lfn not in inputLFNs) and (fsize is None or
                                                   (md5sum is None
                                                    and adler32 is None)):
                        if EventServiceUtils.isEventServiceMerge(self.job):
                            continue
                        else:
                            raise RuntimeError(
                                'fsize/md5sum/adler32/surl=None')
                    # append
                    lfns.append(lfn)
                    guids.append(guid)
                    fsizes.append(fsize)
                    md5sums.append(md5sum)
                    surls.append(surl)
                    if adler32 is not None:
                        # use adler32 if available
                        chksums.append("ad:%s" % adler32)
                    else:
                        chksums.append("md5:%s" % md5sum)
                    if fullLFN is not None:
                        fullLfnMap[lfn] = fullLFN
            except Exception:
                # check if file exists
                # if os.path.exists(self.xmlFile):
                if True:
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type, value))
                    # set failed anyway
                    self.job.jobStatus = 'failed'
                    # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                    if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                            (self.job.taskBufferErrorCode not in [pandaserver.taskbuffer.ErrorCode.EC_WorkerDone]) and \
                            (self.job.transExitCode in [0,'0','NULL']):
                        self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                    return 2
                else:
                    # XML was deleted
                    return 1
        # parse metadata to get nEvents
        nEventsFrom = None
        try:
            root = xml.dom.minidom.parseString(self.job.metadata)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical = file.getElementsByTagName('logical')[0]
                lfnNode = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                guidMap[lfn] = guid
                # get metadata
                nevents = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'events':
                        nevents = long(meta.getAttribute('att_value'))
                        nEventsMap[lfn] = nevents
                        break
            nEventsFrom = "xml"
        except Exception:
            pass
        # parse json
        try:
            import json
            jsonDict = json.loads(self.job.metadata)
            for jsonFileItem in jsonDict['files']['output']:
                for jsonSubFileItem in jsonFileItem['subFiles']:
                    lfn = str(jsonSubFileItem['name'])
                    try:
                        nevents = long(jsonSubFileItem['nentries'])
                        nEventsMap[lfn] = nevents
                    except Exception:
                        pass
                    try:
                        guid = str(jsonSubFileItem['file_guid'])
                        guidMap[lfn] = guid
                    except Exception:
                        pass
            nEventsFrom = "json"
        except Exception:
            pass
        # use nEvents and GUIDs reported by the pilot if no job report
        if self.job.metadata == 'NULL' and self.jobStatus == 'finished' and self.job.nEvents > 0 \
                and self.job.prodSourceLabel in ['managed']:
            for file in self.job.Files:
                if file.type == 'output':
                    nEventsMap[file.lfn] = self.job.nEvents
            for lfn, guid in zip(lfns, guids):
                guidMap[lfn] = guid
            nEventsFrom = "pilot"
        self.logger.debug('nEventsMap=%s' % str(nEventsMap))
        self.logger.debug('nEventsFrom=%s' % str(nEventsFrom))
        self.logger.debug('guidMap=%s' % str(guidMap))
        self.logger.debug('self.job.jobStatus=%s in parseXML' %
                          self.job.jobStatus)
        self.logger.debug(
            'isES=%s isJumbo=%s' % (EventServiceUtils.isEventServiceJob(
                self.job), EventServiceUtils.isJumboJob(self.job)))
        # get lumi block number
        lumiBlockNr = self.job.getLumiBlockNr()
        # copy files for variable number of outputs
        tmpStat = self.copyFilesForVariableNumOutputs(lfns)
        if not tmpStat:
            self.logger.error(
                "failed to copy files for variable number of outputs")
            return 2
        # check files
        fileList = []
        for file in self.job.Files:
            fileList.append(file.lfn)
            if file.type == 'input':
                if file.lfn in lfns:
                    if self.job.prodSourceLabel in ['user', 'panda']:
                        # skipped file
                        file.status = 'skipped'
                    elif self.job.prodSourceLabel in [
                            'managed', 'test'
                    ] + JobUtils.list_ptest_prod_sources:
                        # failed by pilot
                        file.status = 'failed'
            elif file.type == 'output' or file.type == 'log':
                # add only log file for failed jobs
                if self.jobStatus == 'failed' and file.type != 'log':
                    file.status = 'failed'
                    continue
                # set failed if it is missing in XML
                if file.lfn not in lfns:
                    if (self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job)) \
                            or EventServiceUtils.isJumboJob(self.job):
                        # unset file status for ES jobs
                        pass
                    elif file.isAllowedNoOutput():
                        # allowed not to be produced
                        file.status = 'nooutput'
                        self.logger.debug('set {0} to status={1}'.format(
                            file.lfn, file.status))
                    else:
                        file.status = 'failed'
                        self.job.jobStatus = 'failed'
                        self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(
                            file.lfn)
                        self.logger.error(self.job.ddmErrorDiag)
                    continue
                # look for GUID with LFN
                try:
                    i = lfns.index(file.lfn)
                    file.GUID = guids[i]
                    file.fsize = fsizes[i]
                    file.md5sum = md5sums[i]
                    file.checksum = chksums[i]
                    surl = surls[i]
                    # status
                    file.status = 'ready'
                    # change to full LFN
                    if file.lfn in fullLfnMap:
                        file.lfn = fullLfnMap[file.lfn]
                    # add SURL to extraInfo
                    self.extraInfo['surl'][file.lfn] = surl
                    # add nevents
                    if file.lfn in nEventsMap:
                        self.extraInfo['nevents'][file.lfn] = nEventsMap[
                            file.lfn]
                except Exception:
                    # status
                    file.status = 'failed'
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type, value))
                # set lumi block number
                if lumiBlockNr is not None and file.status != 'failed':
                    self.extraInfo['lbnr'][file.lfn] = lumiBlockNr
        self.extraInfo['guid'] = guidMap
        # check consistency between XML and filesTable
        for lfn in lfns:
            if lfn not in fileList:
                self.logger.error("%s is not found in filesTable" % lfn)
                self.job.jobStatus = 'failed'
                for tmpFile in self.job.Files:
                    tmpFile.status = 'failed'
                self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(
                    lfn)
                return 2
        # return
        self.logger.debug("parseXML end")
        return 0

    # copy files for variable number of outputs
    def copyFilesForVariableNumOutputs(self, lfns):
        """Clone file records for extra outputs reported by the pilot.

        Outputs matching '<originalLFN>.*_NNN' that are not yet in the file
        table are copied from the corresponding original record.

        :param lfns: LFNs found in the pilot report
        :return: True on success, False if copying a record failed
        """
        # get original output files
        origOutputs = {}
        updateOrig = {}
        for tmpFile in self.job.Files:
            if tmpFile.type in ['output', 'log']:
                origOutputs[tmpFile.lfn] = tmpFile
                if tmpFile.lfn in lfns:
                    # keep original
                    updateOrig[tmpFile.lfn] = False
                else:
                    # overwrite original
                    updateOrig[tmpFile.lfn] = True
        # look for unkown files
        addedNewFiles = False
        for newLFN in lfns:
            if newLFN not in origOutputs:
                # look for corresponding original output
                for origLFN in origOutputs:
                    tmpPatt = '^{0}\.*_\d+$'.format(origLFN)
                    if re.search(tmpPatt, newLFN) is not None:
                        # copy file record
                        tmpStat = self.taskBuffer.copyFileRecord(
                            newLFN, origOutputs[origLFN], updateOrig[origLFN])
                        if not tmpStat:
                            return False
                        addedNewFiles = True
                        # disable further overwriting
                        updateOrig[origLFN] = False
                        break
        # refresh job info
        if addedNewFiles:
            self.job = self.taskBuffer.peekJobs([self.jobID],
                                                fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
        # return
        return True
def core_exec(sandbox_url, log_token, dump_workflow, ops_file, user_name, test_mode):
    """Download a user sandbox, parse its workflow description and submit it.

    Runs as a standalone subprocess (all flag arguments arrive as strings).
    Prints the name of a JSON file with [is_OK, is_fatal, request_id, log]
    and exits.

    :param sandbox_url: URL of the sandbox tarball
    :param log_token: token prefix for log messages
    :param dump_workflow: 'True' to dump the converted workflow to the log
    :param ops_file: path to a JSON file with the request options (deleted)
    :param user_name: submitting user
    :param test_mode: 'True' to skip the actual submission
    """
    tmpLog = LogWrapper(_logger, log_token)
    is_OK = True
    is_fatal = False
    request_id = None
    # CLI flags arrive as strings; normalize to booleans
    dump_workflow = dump_workflow == 'True'
    test_mode = test_mode == 'True'
    try:
        with open(ops_file) as f:
            ops = json.load(f)
        try:
            os.remove(ops_file)
        except Exception:
            pass
        # go to temp dir
        cur_dir = os.getcwd()
        with tempfile.TemporaryDirectory() as tmp_dirname:
            try:
                os.chdir(tmp_dirname)
                # download sandbox
                # NOTE: verify=False disables TLS certificate checks — kept as-is
                tmpLog.info('downloading sandbox from {}'.format(sandbox_url))
                with requests.get(sandbox_url, allow_redirects=True, verify=False,
                                  stream=True) as r:
                    if r.status_code == 400:
                        tmpLog.error("not found")
                        is_fatal = True
                        is_OK = False
                    elif r.status_code != 200:
                        tmpLog.error("bad HTTP response {}".format(r.status_code))
                        is_OK = False
                    # extract sandbox
                    if is_OK:
                        with open(ops['data']['sandbox'], 'wb') as fs:
                            for chunk in r.raw.stream(1024, decode_content=False):
                                if chunk:
                                    fs.write(chunk)
                        tmp_stat, tmp_out = commands_get_status_output(
                            'tar xvfz {}'.format(ops['data']['sandbox']))
                        if tmp_stat != 0:
                            tmpLog.error(tmp_out)
                            dump_str = 'failed to extract {}'.format(ops['data']['sandbox'])
                            tmpLog.error(dump_str)
                            is_fatal = True
                            is_OK = False
                    # parse workflow files
                    if is_OK:
                        tmpLog.info('parse workflow')
                        if ops['data']['language'] == 'cwl':
                            nodes, root_in = pcwl_utils.parse_workflow_file(
                                ops['data']['workflowSpecFile'], tmpLog)
                            with open(ops['data']['workflowInputFile']) as workflow_input:
                                data = yaml.safe_load(workflow_input)
                            s_id, t_nodes, nodes = pcwl_utils.resolve_nodes(
                                nodes, root_in, data, 0, set(),
                                ops['data']['outDS'], tmpLog)
                            workflow_utils.set_workflow_outputs(nodes)
                            id_node_map = workflow_utils.get_node_id_map(nodes)
                            [node.resolve_params(ops['data']['taskParams'], id_node_map)
                             for node in nodes]
                            dump_str = "the description was internally converted as follows\n" \
                                + workflow_utils.dump_nodes(nodes)
                            tmpLog.info(dump_str)
                            for node in nodes:
                                s_check, o_check = node.verify()
                                tmp_str = 'Verification failure in ID:{} {}'.format(
                                    node.id, o_check)
                                if not s_check:
                                    tmpLog.error(tmp_str)
                                    dump_str += tmp_str
                                    dump_str += '\n'
                                    is_fatal = True
                                    is_OK = False
                        else:
                            # bug fix: the message previously logged a literal
                            # "{}" because .format() was never applied
                            dump_str = "{} is not supported to describe the workflow".format(
                                ops['data']['language'])
                            tmpLog.error(dump_str)
                            is_fatal = True
                            is_OK = False
                    # convert to workflow
                    if is_OK:
                        workflow_to_submit, dump_str_list = \
                            workflow_utils.convert_nodes_to_workflow(nodes)
                        try:
                            if workflow_to_submit:
                                if not test_mode:
                                    tmpLog.info('submit workflow')
                                    wm = ClientManager(host=get_rest_host())
                                    request_id = wm.submit(workflow_to_submit,
                                                           username=user_name)
                            else:
                                dump_str = 'workflow is empty'
                                tmpLog.error(dump_str)
                                is_fatal = True
                                is_OK = False
                        except Exception as e:
                            dump_str = 'failed to submit the workflow with {}'.format(str(e))
                            tmpLog.error('{} {}'.format(dump_str,
                                                        traceback.format_exc()))
                        if dump_workflow:
                            tmpLog.debug('\n' + ''.join(dump_str_list))
            finally:
                # always return to the original cwd so the temporary
                # directory can be removed cleanly even when an exception
                # fires while we are inside it
                os.chdir(cur_dir)
    except Exception as e:
        is_OK = False
        is_fatal = True
        tmpLog.error("failed to run with {} {}".format(str(e),
                                                       traceback.format_exc()))
    # report the outcome to the parent process via a JSON file
    with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_json:
        json.dump([is_OK, is_fatal, request_id, tmpLog.dumpToString()], tmp_json)
    print(tmp_json.name)
    sys.exit(0)
def process(self, file_name, to_delete=False, test_mode=False, get_log=False, dump_workflow=False):
    """Handle one queued workflow-submission request file.

    Delegates the heavy lifting to core_exec in a child process (to avoid
    chdir side effects), optionally deletes the request file and notifies
    the user by mail.

    :param file_name: path to the JSON request file
    :param to_delete: force deletion of the request file
    :param test_mode: run without submitting or sending notifications
    :param get_log: if True, return {'status': ..., 'log': ...}
    :param dump_workflow: forwarded to the child process
    :return: result dict when get_log is True, otherwise None
    """
    # fallback logger so the except/get_log paths work even when the
    # failure happens before the token-carrying logger below is created
    tmpLog = LogWrapper(self.log)
    try:
        is_fatal = False
        is_OK = True
        request_id = None
        dump_str = None
        with open(file_name) as f:
            ops = json.load(f)
        user_name = clean_user_id(ops["userName"])
        base_platform = ops['data'].get('base_platform')
        for task_type in ops['data']['taskParams']:
            ops['data']['taskParams'][task_type]['userName'] = user_name
            if base_platform:
                ops['data']['taskParams'][task_type]['basePlatform'] = base_platform
        log_token = '< id="{}" test={} outDS={} >'.format(
            user_name, test_mode, ops['data']['outDS'])
        tmpLog = LogWrapper(self.log, log_token)
        tmpLog.info('start {}'.format(file_name))
        sandbox_url = os.path.join(ops['data']['sourceURL'], 'cache',
                                   ops['data']['sandbox'])
        # IO through json files
        ops_file = tempfile.NamedTemporaryFile(delete=False, mode='w')
        json.dump(ops, ops_file)
        ops_file.close()
        # execute main in another process to avoid chdir mess
        tmp_stat, tmp_out = commands_get_status_output("python {} {} '{}' {} {} '{}' {}".format(
            __file__, sandbox_url, log_token, dump_workflow, ops_file.name,
            user_name, test_mode))
        if tmp_stat:
            is_OK = False
            tmpLog.error('main execution failed with {}:{}'.format(tmp_stat, tmp_out))
        else:
            # the child prints the name of its result file on the last line
            with open(tmp_out.split('\n')[-1]) as tmp_out_file:
                is_OK, is_fatal, request_id, dump_str = json.load(tmp_out_file)
            try:
                os.remove(tmp_out)
            except Exception:
                pass
        if not get_log:
            if is_OK:
                tmpLog.info('is_OK={} request_id={}'.format(is_OK, request_id))
            else:
                tmpLog.info('is_OK={} is_fatal={} request_id={}'.format(
                    is_OK, is_fatal, request_id))
        if to_delete or (not test_mode and (is_OK or is_fatal)):
            # bug fix: dump_str is None when the child process itself failed;
            # guard the concatenation to avoid a TypeError
            dump_str = tmpLog.dumpToString() + (dump_str or '')
            tmpLog.debug('delete {}'.format(file_name))
            try:
                os.remove(file_name)
            except Exception:
                pass
            # send notification
            if not test_mode and self.taskBuffer is not None:
                toAdder = self.taskBuffer.getEmailAddr(user_name)
                if toAdder is None or toAdder.startswith('notsend'):
                    tmpLog.debug('skip to send notification since suppressed')
                else:
                    # message
                    if is_OK:
                        mailSubject = "PANDA Notification for Workflow {}".format(
                            ops['data']['outDS'])
                        mailBody = "Hello,\n\nWorkflow:{} has been accepted with RequestID:{}\n\n".\
                            format(ops['data']['outDS'], request_id)
                    else:
                        mailSubject = "PANDA WARNING for Workflow={}".format(
                            ops['data']['outDS'])
                        mailBody = "Hello,\n\nWorkflow {} was not accepted\n\n".\
                            format(ops['data']['outDS'])
                    mailBody += "Reason : %s\n" % dump_str
                    # send
                    tmpSM = MailUtils().send(toAdder, mailSubject, mailBody)
                    tmpLog.debug('sent message with {}'.format(tmpSM))
    except Exception as e:
        is_OK = False
        tmpLog.error("failed to run with {} {}".format(str(e),
                                                       traceback.format_exc()))
    if get_log:
        ret_val = {'status': is_OK}
        if is_OK:
            ret_val['log'] = dump_str
        else:
            if dump_str is None:
                ret_val['log'] = tmpLog.dumpToString()
            else:
                ret_val['log'] = dump_str
        return ret_val
class EventPicker:
    """Process one event-picking request (evp) file.

    Reads run/event selections and options from the evp file, converts them
    to a dataset/file list via DynDataDistributer, registers a dataset
    container, optionally triggers DaTRI-style transfer requests through
    Rucio, and notifies the requester by email.
    """
    # constructor
    def __init__(self, taskBuffer, siteMapper, evpFileName, ignoreError):
        """Set up state and a per-request logger.

        :param taskBuffer: task buffer interface for DB operations
        :param siteMapper: site mapper for brokerage / site lookup
        :param evpFileName: path to the event-picking request file
        :param ignoreError: if True, keep the evp file on failure for retry
        """
        self.taskBuffer = taskBuffer
        self.siteMapper = siteMapper
        self.ignoreError = ignoreError
        self.evpFileName = evpFileName
        # timestamp used as the log token for this request
        self.token = datetime.datetime.utcnow().isoformat(' ')
        # logger
        self.logger = LogWrapper(_logger, self.token)
        self.pd2p = DynDataDistributer.DynDataDistributer([], self.taskBuffer, self.siteMapper, token=' ', logger=self.logger)
        self.userDatasetName = ''
        self.creationTime = ''
        self.params = ''
        self.lockedBy = ''
        self.evpFile = None
        self.userTaskName = ''
        # message buffer
        # NOTE(review): msgBuffer/lineLimit are initialized but not used in
        # the methods visible here — possibly consumed elsewhere; verify
        self.msgBuffer = []
        self.lineLimit = 100
        # JEDI
        self.jediTaskID = None
        self.prodSourceLabel = None
        self.job_label = None

    # main
    def run(self):
        """Execute the event-picking request.

        Returns True when the request was handled (successfully or when the
        file could not be locked), False on error. On success the evp file is
        unlocked and removed.
        """
        try:
            self.putLog('start %s' % self.evpFileName)
            # lock evp file (non-blocking; another worker may hold it)
            self.evpFile = open(self.evpFileName)
            try:
                fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
            except Exception:
                # relase
                self.putLog("cannot lock %s" % self.evpFileName)
                self.evpFile.close()
                return True
            # options
            runEvtList = []
            eventPickDataType = ''
            eventPickStreamName = ''
            eventPickDS = []
            eventPickAmiTag = ''
            eventPickNumSites = 1
            inputFileList = []
            tagDsList = []
            tagQuery = ''
            tagStreamRef = ''
            skipDaTRI = False
            runEvtGuidMap = {}
            ei_api = ''
            # read evp file: each line is "key=value"
            for tmpLine in self.evpFile:
                tmpMatch = re.search('^([^=]+)=(.+)$', tmpLine)
                # check format
                if tmpMatch is None:
                    continue
                tmpItems = tmpMatch.groups()
                if tmpItems[0] == 'runEvent':
                    # get run and event number
                    tmpRunEvt = tmpItems[1].split(',')
                    if len(tmpRunEvt) == 2:
                        runEvtList.append(tmpRunEvt)
                elif tmpItems[0] == 'eventPickDataType':
                    # data type
                    eventPickDataType = tmpItems[1]
                elif tmpItems[0] == 'eventPickStreamName':
                    # stream name
                    eventPickStreamName = tmpItems[1]
                elif tmpItems[0] == 'eventPickDS':
                    # dataset pattern
                    eventPickDS = tmpItems[1].split(',')
                elif tmpItems[0] == 'eventPickAmiTag':
                    # AMI tag
                    eventPickAmiTag = tmpItems[1]
                elif tmpItems[0] == 'eventPickNumSites':
                    # the number of sites where datasets are distributed
                    try:
                        eventPickNumSites = int(tmpItems[1])
                    except Exception:
                        pass
                elif tmpItems[0] == 'userName':
                    # user name
                    self.userDN = tmpItems[1]
                    self.putLog("user=%s" % self.userDN)
                elif tmpItems[0] == 'userTaskName':
                    # user task name
                    self.userTaskName = tmpItems[1]
                elif tmpItems[0] == 'userDatasetName':
                    # user dataset name
                    self.userDatasetName = tmpItems[1]
                elif tmpItems[0] == 'lockedBy':
                    # client name
                    self.lockedBy = tmpItems[1]
                elif tmpItems[0] == 'creationTime':
                    # creation time
                    self.creationTime = tmpItems[1]
                elif tmpItems[0] == 'params':
                    # parameters
                    self.params = tmpItems[1]
                elif tmpItems[0] == 'ei_api':
                    # ei api parameter for MC
                    ei_api = tmpItems[1]
                elif tmpItems[0] == 'inputFileList':
                    # input file list
                    inputFileList = tmpItems[1].split(',')
                    try:
                        inputFileList.remove('')
                    except Exception:
                        pass
                elif tmpItems[0] == 'tagDS':
                    # TAG dataset
                    tagDsList = tmpItems[1].split(',')
                elif tmpItems[0] == 'tagQuery':
                    # query for TAG
                    tagQuery = tmpItems[1]
                elif tmpItems[0] == 'tagStreamRef':
                    # StreamRef for TAG
                    tagStreamRef = tmpItems[1]
                    if not tagStreamRef.endswith('_ref'):
                        tagStreamRef += '_ref'
                elif tmpItems[0] == 'runEvtGuidMap':
                    # GUIDs
                    # NOTE(review): eval on file content — acceptable only if
                    # evp files come from a trusted producer; verify upstream
                    try:
                        runEvtGuidMap = eval(tmpItems[1])
                    except Exception:
                        pass
            # extract task name from --outDS when not given explicitly
            if self.userTaskName == '' and self.params != '':
                try:
                    tmpMatch = re.search('--outDS(=| ) *([^ ]+)', self.params)
                    if tmpMatch is not None:
                        self.userTaskName = tmpMatch.group(2)
                        if not self.userTaskName.endswith('/'):
                            self.userTaskName += '/'
                except Exception:
                    pass
            # suppress DaTRI
            if self.params != '':
                if '--eventPickSkipDaTRI' in self.params:
                    skipDaTRI = True
            # get compact user name
            compactDN = self.taskBuffer.cleanUserID(self.userDN)
            # get jediTaskID
            self.jediTaskID = self.taskBuffer.getTaskIDwithTaskNameJEDI(compactDN, self.userTaskName)
            # get prodSourceLabel
            self.prodSourceLabel, self.job_label = self.taskBuffer.getProdSourceLabelwithTaskID(self.jediTaskID)
            # convert run/event list to dataset/file list
            tmpRet, locationMap, allFiles = self.pd2p.convertEvtRunToDatasets(
                runEvtList, eventPickDataType, eventPickStreamName, eventPickDS,
                eventPickAmiTag, self.userDN, runEvtGuidMap, ei_api)
            if not tmpRet:
                # a fatal conversion error forces cleanup even in ignoreError mode
                if 'isFatal' in locationMap and locationMap['isFatal'] is True:
                    self.ignoreError = False
                self.endWithError('Failed to convert the run/event list to a dataset/file list')
                return False
            # use only files in the list
            if inputFileList != []:
                tmpAllFiles = []
                for tmpFile in allFiles:
                    if tmpFile['lfn'] in inputFileList:
                        tmpAllFiles.append(tmpFile)
                allFiles = tmpAllFiles
            # remove redundant CN from DN
            tmpDN = self.userDN
            tmpDN = re.sub('/CN=limited proxy', '', tmpDN)
            tmpDN = re.sub('(/CN=proxy)+$', '', tmpDN)
            # make dataset container
            tmpRet = self.pd2p.registerDatasetContainerWithDatasets(
                self.userDatasetName, allFiles, locationMap,
                nSites=eventPickNumSites, owner=tmpDN)
            if not tmpRet:
                self.endWithError('Failed to make a dataset container %s' % self.userDatasetName)
                return False
            # skip DaTRI
            if skipDaTRI:
                # successfully terminated
                self.putLog("skip DaTRI")
                # update task
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID)
            else:
                # get candidates
                tmpRet, candidateMaps = self.pd2p.getCandidates(
                    self.userDatasetName, self.prodSourceLabel, self.job_label,
                    checkUsedFile=False, useHidden=True)
                if not tmpRet:
                    self.endWithError('Failed to find candidate for destination')
                    return False
                # collect all candidates
                allCandidates = []
                for tmpDS in candidateMaps:
                    tmpDsVal = candidateMaps[tmpDS]
                    for tmpCloud in tmpDsVal:
                        tmpCloudVal = tmpDsVal[tmpCloud]
                        for tmpSiteName in tmpCloudVal[0]:
                            if tmpSiteName not in allCandidates:
                                allCandidates.append(tmpSiteName)
                if allCandidates == []:
                    self.endWithError('No candidate for destination')
                    return False
                # get list of dataset (container) names
                if eventPickNumSites > 1:
                    # decompose container to transfer datasets separately
                    tmpRet, tmpOut = self.pd2p.getListDatasetReplicasInContainer(self.userDatasetName)
                    if not tmpRet:
                        self.endWithError('Failed to get replicas in %s' % self.userDatasetName)
                        return False
                    userDatasetNameList = list(tmpOut)
                else:
                    # transfer container at once
                    userDatasetNameList = [self.userDatasetName]
                # loop over all datasets
                sitesUsed = []
                for tmpUserDatasetName in userDatasetNameList:
                    # get size of dataset container
                    tmpRet, totalInputSize = rucioAPI.getDatasetSize(tmpUserDatasetName)
                    if not tmpRet:
                        self.endWithError('Failed to get the size of {0} with {1}'.format(
                            tmpUserDatasetName, totalInputSize))
                        return False
                    # run brokerage
                    tmpJob = JobSpec()
                    tmpJob.AtlasRelease = ''
                    # NOTE(review): tmpDS here is the leftover loop variable
                    # from the candidateMaps loop above, not the dataset of
                    # this iteration (tmpUserDatasetName) — looks like a bug;
                    # confirm against upstream before changing
                    self.putLog("run brokerage for %s" % tmpDS)
                    pandaserver.brokerage.broker.schedule(
                        [tmpJob], self.taskBuffer, self.siteMapper, True,
                        allCandidates, True, datasetSize=totalInputSize)
                    if tmpJob.computingSite.startswith('ERROR'):
                        self.endWithError('brokerage failed with %s' % tmpJob.computingSite)
                        return False
                    self.putLog("site -> %s" % tmpJob.computingSite)
                    # send transfer request
                    try:
                        tmpDN = rucioAPI.parse_dn(tmpDN)
                        tmpStatus, userInfo = rucioAPI.finger(tmpDN)
                        if not tmpStatus:
                            raise RuntimeError('user info not found for {0} with {1}'.format(tmpDN, userInfo))
                        tmpDN = userInfo['nickname']
                        tmpSiteSpec = self.siteMapper.getSite(tmpJob.computingSite)
                        scope_input, scope_output = select_scope(tmpSiteSpec, JobUtils.ANALY_PS, JobUtils.ANALY_PS)
                        tmpDQ2ID = tmpSiteSpec.ddm_input[scope_input]
                        tmpMsg = "%s ds=%s site=%s id=%s" % (
                            'registerDatasetLocation for DaTRI ', tmpUserDatasetName, tmpDQ2ID, tmpDN)
                        self.putLog(tmpMsg)
                        # NOTE(review): registers tmpDS rather than
                        # tmpUserDatasetName (same suspicion as above)
                        rucioAPI.registerDatasetLocation(
                            tmpDS, [tmpDQ2ID], lifetime=14, owner=tmpDN,
                            activity="User Subscriptions")
                        self.putLog('OK')
                    except Exception:
                        errType, errValue = sys.exc_info()[:2]
                        tmpStr = 'Failed to send transfer request : %s %s' % (errType, errValue)
                        # NOTE(review): strip() result is discarded — no-op
                        tmpStr.strip()
                        tmpStr += traceback.format_exc()
                        self.endWithError(tmpStr)
                        return False
                    # list of sites already used
                    sitesUsed.append(tmpJob.computingSite)
                    self.putLog("used %s sites" % len(sitesUsed))
                    # set candidates
                    if len(sitesUsed) >= eventPickNumSites:
                        # reset candidates to limit the number of sites
                        allCandidates = sitesUsed
                        sitesUsed = []
                    else:
                        # remove site
                        allCandidates.remove(tmpJob.computingSite)
                # send email notification for success
                tmpMsg = 'A transfer request was successfully sent to Rucio.\n'
                tmpMsg += 'Your task will get started once transfer is completed.'
                self.sendEmail(True, tmpMsg)
            try:
                # unlock and delete evp file
                fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_UN)
                self.evpFile.close()
                os.remove(self.evpFileName)
            except Exception:
                pass
            # successfully terminated
            self.putLog("end %s" % self.evpFileName)
            return True
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            self.endWithError('Got exception %s:%s %s' % (errType, errValue, traceback.format_exc()))
            return False

    # end with error
    def endWithError(self, message):
        """Log *message*, release the evp file, notify the user, upload the log.

        When ignoreError is True the evp file is kept so the request can be
        retried; otherwise it is removed and a failure email is sent.
        """
        self.putLog(message, 'error')
        # unlock evp file
        try:
            fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_UN)
            self.evpFile.close()
            if not self.ignoreError:
                # remove evp file
                os.remove(self.evpFileName)
                # send email notification
                self.sendEmail(False, message)
        except Exception:
            pass
        # upload log
        if self.jediTaskID is not None:
            outLog = self.uploadLog()
            self.taskBuffer.updateTaskErrorDialogJEDI(self.jediTaskID, 'event picking failed. ' + outLog)
            # update task
            if not self.ignoreError:
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID, 'tobroken')
            self.putLog(outLog)
        self.putLog('end %s' % self.evpFileName)

    # put log
    def putLog(self, msg, type='debug'):
        """Write *msg* to the request logger at 'debug' or 'error' level."""
        tmpMsg = msg
        if type == 'error':
            self.logger.error(tmpMsg)
        else:
            self.logger.debug(tmpMsg)

    # send email notification
    def sendEmail(self, isSucceeded, message):
        """Email the requester the final status of the event-picking request."""
        # mail address
        toAdder = Notifier(self.taskBuffer, None, []).getEmail(self.userDN)
        if toAdder == '':
            self.putLog('cannot find email address for %s' % self.userDN, 'error')
            return
        # subject
        mailSubject = "PANDA notification for Event-Picking Request"
        # message
        mailBody = "Hello,\n\nHere is your request status for event picking\n\n"
        if isSucceeded:
            mailBody += "Status : Passed to Rucio\n"
        else:
            mailBody += "Status : Failed\n"
        mailBody += "Created : %s\n" % self.creationTime
        mailBody += "Ended : %s\n" % datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
        mailBody += "Dataset : %s\n" % self.userDatasetName
        mailBody += "\n"
        mailBody += "Parameters : %s %s\n" % (self.lockedBy, self.params)
        mailBody += "\n"
        mailBody += "%s\n" % message
        # send (return value intentionally unused)
        retVal = MailUtils().send(toAdder, mailSubject, mailBody)
        # return
        return

    # upload log
    def uploadLog(self):
        """Upload the accumulated log for the JEDI task; return a status string
        or an HTML link to the uploaded log."""
        if self.jediTaskID is None:
            return 'cannot find jediTaskID'
        strMsg = self.logger.dumpToString()
        s, o = Client.uploadLog(strMsg, self.jediTaskID)
        if s != 0:
            return "failed to upload log with {0}.".format(s)
        if o.startswith('http'):
            return '<a href="{0}">log</a>'.format(o)
        return o
def run(self):
    """Run the setupper.

    When forkRun is False, jobs are grouped by VO and each group is handed to
    the matching setupper plugin in this process. When forkRun is True, the
    jobs are pickled to a file and forkSetupper.py is executed in a separate
    process (python does not return freed memory to the OS).
    """
    try:
        # make a message instance
        tmpLog = LogWrapper(_logger, None)
        # run main procedure in the same process
        if not self.forkRun:
            tmpLog.debug('main start')
            tmpLog.debug('firstSubmission={0}'.format(self.firstSubmission))
            # group jobs per VO
            voJobsMap = {}
            # NOTE(review): ddmFreeJobs is never appended to or read here —
            # possibly a leftover; verify before removing
            ddmFreeJobs = []
            tmpLog.debug('{0} jobs in total'.format(len(self.jobs)))
            for tmpJob in self.jobs:
                # set VO=local for DDM free
                if tmpJob.destinationSE == 'local':
                    tmpVO = 'local'
                else:
                    tmpVO = tmpJob.VO
                # make map
                voJobsMap.setdefault(tmpVO, [])
                voJobsMap[tmpVO].append(tmpJob)
            # loop over all VOs
            for tmpVO in voJobsMap:
                tmpJobList = voJobsMap[tmpVO]
                tmpLog.debug('vo={0} has {1} jobs'.format(tmpVO, len(tmpJobList)))
                # get plugin
                setupperPluginClass = panda_config.getPlugin('setupper_plugins', tmpVO)
                if setupperPluginClass is None:
                    # use ATLAS plug-in by default
                    from pandaserver.dataservice.SetupperAtlasPlugin import SetupperAtlasPlugin
                    setupperPluginClass = SetupperAtlasPlugin
                tmpLog.debug('plugin name -> {0}'.format(setupperPluginClass.__name__))
                try:
                    # make plugin
                    setupperPlugin = setupperPluginClass(
                        self.taskBuffer, self.jobs, tmpLog,
                        resubmit=self.resubmit, pandaDDM=self.pandaDDM,
                        ddmAttempt=self.ddmAttempt, onlyTA=self.onlyTA,
                        firstSubmission=self.firstSubmission)
                    # run plugin
                    tmpLog.debug('run plugin')
                    setupperPlugin.run()
                    # go forward if not TA
                    if not self.onlyTA:
                        # update jobs
                        tmpLog.debug('update jobs')
                        self.updateJobs(setupperPlugin.jobs + setupperPlugin.jumboJobs, tmpLog)
                        # execute post process
                        tmpLog.debug('post execute plugin')
                        setupperPlugin.postRun()
                    tmpLog.debug('done plugin')
                except Exception:
                    # a failing plugin must not abort the other VOs
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('plugin failed with {0}:{1}'.format(errtype, errvalue))
            tmpLog.debug('main end')
        else:
            tmpLog.debug('fork start')
            # write jobs to file
            import os
            try:
                import cPickle as pickle
            except ImportError:
                import pickle
            outFileName = '%s/set.%s_%s' % (panda_config.logdir, self.jobs[0].PandaID, str(uuid.uuid4()))
            outFile = open(outFileName, 'wb')
            # protocol=0 keeps the dump readable by the child interpreter
            pickle.dump(self.jobs, outFile, protocol=0)
            outFile.close()
            # run main procedure in another process because python doesn't release memory
            com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (
                panda_config.home_dir_cwd, panda_config.home_dir_cwd)
            com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \
                (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python,
                 panda_config.pandaPython_dir,outFileName)
            if self.onlyTA:
                com += " -t"
            if not self.firstSubmission:
                com += " -f"
            tmpLog.debug(com)
            # execute under the process limiter to bound concurrent forks
            status, output = self.taskBuffer.processLimiter.getstatusoutput(com)
            tmpLog.debug("return from main process: %s %s" % (status, output))
            tmpLog.debug('fork end')
    except Exception as e:
        tmpLog.error('master failed with {0} {1}'.format(str(e), traceback.format_exc()))
# Top of the priority-massage script: set up logging, the task buffer and
# the site mapper, then scan active/archived job tables for usage breakdown.
# NOTE(review): this chunk is truncated — the for-loop body continues beyond
# the visible source (varMap is populated but not yet used here).
import datetime
import traceback

from pandaserver.taskbuffer.TaskBuffer import taskBuffer
from pandacommon.pandalogger.PandaLogger import PandaLogger
from pandacommon.pandalogger.LogWrapper import LogWrapper
from pandaserver.brokerage.SiteMapper import SiteMapper

# password
from pandaserver.config import panda_config

passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('prioryMassage')
tmpLog = LogWrapper(_logger)

tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# get usage breakdown
usageBreakDownPerUser = {}
usageBreakDownPerSite = {}
workingGroupList = []
for table in ['ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsArchived4']:
    varMap = {}
    varMap[':prodSourceLabel'] = 'user'
    varMap[':pmerge'] = 'pmerge'
def getGUIDsFromEventIndex(self, runEventList, streamName, amiTags, dataType):
    """Look up file GUIDs for a list of run/event pairs in the Event Index.

    The pairs are bulk-inserted into a temporary table and joined against an
    Event Index view; when several AMI tags match the same run/event pair the
    GUID of the earliest tag in *amiTags* wins.

    :param runEventList: list of (runNumber, eventNumber) pairs
    :param streamName: stream name filter, or None/'' for no filtering
    :param amiTags: comma-separated AMI tags ('*' wildcards allowed), or None/''
    :param dataType: one of 'RAW', 'ESD', 'AOD'
    :return: (True, {(run, event): [guid]}) on success,
             (False, error message or None) on failure
    """
    comment = ' /* DBProxy.getGUIDsFromEventIndex */'
    methodName = comment.split(' ')[-2].split('.')[-1]
    tmpLog = LogWrapper(
        _logger,
        methodName + " <streamName={0} amiTags={1} dataType={2}>".format(
            streamName, amiTags, dataType))
    try:
        # change to list ('*' wildcards become regex '.*' for getIndexAmiTag)
        if amiTags not in [None, '']:
            amiTags = amiTags.replace('*', '.*').split(',')
        tmpLog.debug("start for {0} events".format(len(runEventList)))
        # check data type
        if dataType not in ['RAW', 'ESD', 'AOD']:
            return False, 'dataType={0} is unsupported'.format(dataType)
        # sql to insert runs and events
        sqlRE = "INSERT INTO {0}.TMP_RUN_EVENT_PAIRS (runNumber,eventNumber) ".format(
            panda_config.schemaEI)
        sqlRE += "VALUES (:runNumber,:eventNumber) "
        varMaps = []
        for runNumber, eventNumber in runEventList:
            varMaps.append({':runNumber': runNumber,
                            ':eventNumber': eventNumber})
        # begin transaction
        self.conn.begin()
        self.cur.arraysize = 100000
        # insert runs and events
        self.cur.executemany(sqlRE + comment, varMaps)
        # read GUIDs: the view with AMI tags returns an extra amiTag column
        varMap = {}
        if amiTags in [None, '']:
            sqlRG = "SELECT runNumber,eventNumber,guid_{0} ".format(dataType)
            sqlRG += "FROM {0}.V_PANDA_EVPICK_NOAMITAG_MANY ".format(
                panda_config.schemaEI)
        else:
            sqlRG = "SELECT runNumber,eventNumber,guid_{0},amiTag ".format(dataType)
            sqlRG += "FROM {0}.V_PANDA_EVPICK_AMITAG_MANY ".format(
                panda_config.schemaEI)
        if streamName not in [None, '']:
            sqlRG += "WHERE streamName=:streamName "
            varMap[':streamName'] = streamName
        self.cur.execute(sqlRG + comment, varMap)
        resRG = self.cur.fetchall()
        # commit
        if not self._commit():
            raise RuntimeError('Commit error')
        retValue = {}
        keyAmiIdxMap = {}
        for tmpItem in resRG:
            if amiTags in [None, '']:
                runNumber, eventNumber, guid = tmpItem
                # dummy
                idxTag = 0
            else:
                runNumber, eventNumber, guid, amiTag = tmpItem
                # get index number for the AMI tag in the list
                idxTag = self.getIndexAmiTag(amiTags, amiTag)
                # didn't match
                if idxTag is None:
                    continue
            tmpKey = (runNumber, eventNumber)
            # use AMI tag in a preference order: keep the lowest index seen
            if tmpKey in keyAmiIdxMap and keyAmiIdxMap[tmpKey] < idxTag:
                continue
            keyAmiIdxMap[tmpKey] = idxTag
            retValue[tmpKey] = [guid]
        tmpLog.debug("found {0} events".format(len(retValue)))
        return True, retValue
    except Exception:
        # roll back
        self._rollback()
        # error
        self.dumpErrorMessage(_logger, methodName)
        return False, None
# Top of the adder (add.py) script: logging setup, timeout configuration.
# NOTE(review): this chunk is truncated — the trailing try: for killing old
# processes continues beyond the visible source.
from pandaserver.taskbuffer.TaskBufferInterface import TaskBufferInterface

# python2 compatibility: map the removed long type onto int under python3
try:
    long
except NameError:
    long = int

# password
from pandaserver.config import panda_config

# logger
_logger = PandaLogger().getLogger('add')
tmpLog = LogWrapper(_logger, None)

tmpLog.debug("===================== start =====================")

# overall timeout value
overallTimeout = 20
# grace period (minutes) taken from the first command-line argument, default 3
try:
    gracePeriod = int(sys.argv[1])
except Exception:
    gracePeriod = 3
# current minute
currentMinute = datetime.datetime.utcnow().minute

# kill old process
try:
def application(environ, start_response):
    """WSGI entry point: dispatch a request to the API method named by the URL.

    The last path component of SCRIPT_NAME selects a function from this
    module's globals (restricted to allowedMethods). The request body is
    parsed either as form data or, for application/json, as gzipped JSON, and
    passed as keyword arguments. Returns the method's result as plain text or
    JSON; maps EC_NotFound to 404 and EC_Redirect to 302.
    """
    # get method name
    methodName = ''
    if 'SCRIPT_NAME' in environ:
        methodName = environ['SCRIPT_NAME'].split('/')[-1]
    tmpLog = LogWrapper(_logger, "PID={0} {1}".format(os.getpid(), methodName), seeMem=True)
    cont_length = int(environ.get('CONTENT_LENGTH', 0))
    json_body = environ.get('CONTENT_TYPE', None) == 'application/json'
    tmpLog.debug("start content-length={} json={}".format(cont_length, json_body))
    regStart = datetime.datetime.utcnow()
    retType = None
    # check method name against the allow-list
    if methodName not in allowedMethods:
        tmpLog.error("is forbidden")
        exeRes = "False : %s is forbidden" % methodName
    else:
        # get method object
        tmpMethod = None
        try:
            tmpMethod = globals()[methodName]
        except Exception:
            pass
        # object not found
        if tmpMethod is None:
            tmpLog.error("is undefined")
            exeRes = "False"
        else:
            body = b''
            try:
                # dummy request object
                dummyReq = DummyReq(environ, tmpLog)
                if not dummyReq.authenticated:
                    start_response('403 Forbidden', [('Content-Type', 'text/plain')])
                    return ["ERROR : Token authentication failed on the server side. {}".format(
                        dummyReq.message).encode()]
                username = dummyReq.subprocess_env.get('SSL_CLIENT_S_DN', None)
                if username:
                    # reject banned users identified by their cleaned DN
                    username = CoreUtils.clean_user_id(username)
                    if username in ban_user_list:
                        errMsg = '{} is banned'.format(username)
                        tmpLog.warning(errMsg)
                        start_response('403 Forbidden', [('Content-Type', 'text/plain')])
                        return ["ERROR : {}".format(errMsg).encode()]
                # read contents in 1 MB chunks until CONTENT_LENGTH is consumed
                while cont_length > 0:
                    chunk = environ['wsgi.input'].read(min(cont_length, 1024*1024))
                    if not chunk:
                        break
                    cont_length -= len(chunk)
                    body += chunk
                if cont_length > 0:
                    raise OSError('partial read from client. {} bytes remaining'.format(cont_length))
                if not json_body:
                    # query string
                    environ['wsgi.input'] = io.BytesIO(body)
                    # get params
                    tmpPars = cgi.FieldStorage(environ['wsgi.input'], environ=environ,
                                               keep_blank_values=1)
                    # convert to map
                    params = {}
                    for tmpKey in list(tmpPars):
                        if tmpPars[tmpKey].file is not None and tmpPars[tmpKey].filename is not None:
                            # file
                            params[tmpKey] = tmpPars[tmpKey]
                        else:
                            # string
                            params[tmpKey] = tmpPars.getfirst(tmpKey)
                else:
                    # json bodies are expected gzip-compressed
                    body = gzip.decompress(body)
                    params = json.loads(body)
                if panda_config.entryVerbose:
                    tmpLog.debug("with %s" % str(list(params)))
                param_list = [dummyReq]
                # exec
                exeRes = tmpMethod(*param_list, **params)
                # extract return type
                if isinstance(exeRes, dict):
                    retType = exeRes['type']
                    exeRes = exeRes['content']
                # convert bool to string
                if exeRes in [True,False]:
                    exeRes = str(exeRes)
            except Exception as e:
                tmpLog.error("execution failure : {0}\n {1}".format(str(e), traceback.format_exc()))
                # optionally dump the raw request body for post-mortem debugging
                if hasattr(panda_config, 'dumpBadRequest') and panda_config.dumpBadRequest:
                    try:
                        with tempfile.NamedTemporaryFile(delete=False, prefix='req_dump_') as f:
                            environ['WSGI_INPUT_DUMP'] = f.name
                            f.write(body)
                            os.chmod(f.name, 0o775)
                    except Exception:
                        tmpLog.error(traceback.format_exc())
                        pass
                errStr = ""
                for tmpKey in environ:
                    tmpVal = environ[tmpKey]
                    errStr += "%s : %s\n" % (tmpKey,str(tmpVal))
                tmpLog.error(errStr)
                # return internal server error
                start_response('500 INTERNAL SERVER ERROR', [('Content-Type', 'text/plain')])
                # force kill to release memory
                if type(e) == OSError:
                    tmpLog.warning('force restart due')
                    os.kill(os.getpid(), signal.SIGINT)
                return [str(e).encode()]
    if panda_config.entryVerbose:
        tmpLog.debug("done")
    regTime = datetime.datetime.utcnow() - regStart
    tmpLog.info("exec_time=%s.%03d sec, return len=%s B" % (
        regTime.seconds, regTime.microseconds/1000, len(str(exeRes))))
    # return
    if exeRes == pandaserver.taskbuffer.ErrorCode.EC_NotFound:
        start_response('404 Not Found', [('Content-Type', 'text/plain')])
        return ['not found'.encode()]
    elif isinstance(exeRes, pandaserver.taskbuffer.ErrorCode.EC_Redirect):
        start_response('302 Redirect', [('Location', exeRes.url)])
        return ['redirect'.encode()]
    else:
        if retType == 'json':
            start_response('200 OK', [('Content-Type', 'application/json')])
        else:
            start_response('200 OK', [('Content-Type', 'text/plain')])
        if isinstance(exeRes, str):
            exeRes = exeRes.encode()
        return [exeRes]