def __init__(self, rank=None, ATHENA_PROC_NUMBER=1, workingDir=None):
    """Initialise the bookkeeping state of the event-server job manager.

    Args:
        rank: Identifier stored on the instance and used as the
            "Rank %s" prefix of the debug message emitted here.
        ATHENA_PROC_NUMBER: Converted with ``int()`` and stored.
        workingDir: When truthy, 'EventServiceManager.log' is created
            inside this directory; otherwise the bare file name is used.
    """
    # Identity and channel naming.
    self.__rank = rank
    self.__name = "EventServerJobManager"
    self.__eventRangeChannelName = "EventRangeChannel"

    # Event-range bookkeeping and message plumbing.
    self.__eventRanges = []
    self.__eventRangesStatus = {}
    self.__outputMessage = []
    self.__messageQueue = multiprocessing.Queue()
    self.__messageInQueue = multiprocessing.Queue()
    self.__messageThread = None

    # Handles for helper processes; nothing is started here.
    self.__TokenExtractorCmd = None
    self.__TokenExtractorProcess = None
    self.__athenaMPProcess = None
    self.__athenaMP_isReady = False
    self.__athenaMP_needEvents = 0
    self.__pollTimeout = 5
    self.__child_pid = None
    self.__child_cpuTime = {}

    # The log file lives in the working directory when one is supplied.
    log_file = 'EventServiceManager.log'
    log_path = os.path.join(workingDir, log_file) if workingDir else log_file
    self.__log = Logger.Logger(filename=log_path)

    # Termination state.
    self.__childProcs = []
    self.__isKilled = False
    self.__waitTerminate = False
    self.__waitTerminateTime = 1800
    self.__startTerminateTime = None
    self.__noMoreEvents = False
    self.__insertedMessages = 0
    self.__ATHENA_PROC_NUMBER = int(ATHENA_PROC_NUMBER)
    self.__numOutputs = 0

    # Kept at this exact position: the attributes above exist before the
    # handlers are installed, the ones below are assigned afterwards.
    self.initSignalHandler()

    self.__childRetStatus = 0
    self.__retry = 0
    self.__errEvent = False

    # accounting
    self.__startTime = time.time()
    self.__readyForEventTime = None
    self.__endTime = None
    self.__startOSTimes = os.times()
    self.__log.debug(
        "Rank %s: startOSTimes: %s" % (self.__rank, self.__startOSTimes))
    self.__endOSTimes = None
    self.__totalQueuedEvents = 0
    self.__totalProcessedEvents = 0
    self.__cpuConsumptionTime = 0
    self.__helperThread = None
def __init__(self, globalWorkingDir, localWorkingDir, outputs=None, job=None, esJobManager=None, outputDir=None, rank=None, logger=None):
    """Initialise the worker thread and run ``setup(job)``.

    Args:
        globalWorkingDir: Stored on the instance; its parent directory is
            exported as the ``PilotHomeDir`` environment variable.
        localWorkingDir: Base directory under which ``rank_<rank>/Droid.log``
            is opened.
        outputs: Stored as-is.
        job: Stored and passed to ``self.setup``.
        esJobManager: Stored as-is.
        outputDir: Stored as-is.
        rank: Stored; used in the log directory name and log messages.
        logger: Accepted for interface compatibility but NOT used. The
            original code guarded its use with ``if logger and False``,
            which can never be true, so a dedicated file logger is always
            created.
    """
    threading.Thread.__init__(self)
    self.__globalWorkingDir = globalWorkingDir
    self.__localWorkingDir = localWorkingDir
    self.__currentDir = None
    self.__rank = rank

    # Always log to <localWorkingDir>/rank_<rank>/Droid.log (see `logger`
    # in the docstring). `_abspath` / `_join` are module-level helpers,
    # presumably aliases of os.path.abspath / os.path.join - confirm.
    base_dir = _abspath(self.__localWorkingDir)
    rank_dir = _abspath(_join(base_dir, "rank_%s" % str(self.__rank)))
    self.__tmpLog = Logger.Logger(filename=os.path.join(rank_dir, 'Droid.log'))

    self.__job = job
    self.__esJobManager = esJobManager
    self.__stop = threading.Event()
    self.__isFinished = False
    self.__tmpLog.info("Rank %s: Global working dir: %s" % (self.__rank, self.__globalWorkingDir))

    # Unconditionally overrides any existing PilotHomeDir for this process.
    os.environ['PilotHomeDir'] = os.path.dirname(self.__globalWorkingDir)

    self.__jobId = None
    self.__copyOutputToGlobal = False
    self.__outputDir = outputDir
    self.__hostname = socket.getfqdn()
    self.__outputs = outputs
    self.__threadpool = None
    self.setup(job)
def __init__(self, globalWorkingDir, localWorkingDir, rank=None, nonMPIMode=False, reserveCores=0, outputDir=None):
    """Initialise the Droid thread state, working directory and signal handlers.

    Args:
        globalWorkingDir: Stored; also exported as ``PilotHomeDir`` when
            that environment variable is not already set.
        localWorkingDir: Stored as-is.
        rank: Passed to ``Interaction.Requester``; used directly as this
            instance's rank only when ``nonMPIMode`` is true.
        nonMPIMode: When true, ``rank`` is taken from the argument and no
            signal handlers are installed. Otherwise the rank comes from
            the requester's ``getRank()`` and ``self.stop`` is registered
            for several termination signals.
        reserveCores: Stored on the public ``reserveCores`` attribute.
        outputDir: Stored as-is.
    """
    threading.Thread.__init__(self)
    self.__globalWorkingDir = globalWorkingDir
    self.__localWorkingDir = localWorkingDir
    self.__currentDir = None
    self.__tmpLog = Logger.Logger(filename='Droid.log')
    self.__comm = Interaction.Requester(rank=rank, nonMPIMode=nonMPIMode, logger=self.__tmpLog)
    self.__esJobManager = None
    self.__isFinished = False

    if nonMPIMode:
        self.__rank = rank
    else:
        self.__rank = self.__comm.getRank()
    self.__tmpLog.info("Rank %s: Global working dir: %s" % (self.__rank, self.__globalWorkingDir))

    # `in` works on Python 2 and 3; dict.has_key() was removed in Python 3.
    if 'PilotHomeDir' not in os.environ:
        os.environ['PilotHomeDir'] = self.__globalWorkingDir

    self.initWorkingDir()
    self.__tmpLog.info("Rank %s: Current working dir: %s" % (self.__rank, self.__currentDir))

    # Per-job state, reset to neutral defaults here.
    self.__jobId = None
    self.__startTimeOneJobDroid = None
    self.__cpuTimeOneJobDroid = None
    self.__poolFileCatalog = None
    self.__inputFiles = None
    self.__copyInputFiles = None
    self.__preSetup = None
    self.__postRun = None
    self.__ATHENA_PROC_NUMBER = 1
    self.__firstGetEventRanges = True
    self.__outputDir = outputDir
    self.__yodaToOS = False
    self.reserveCores = reserveCores
    self.__hostname = socket.getfqdn()
    self.__outputs = Queue()
    self.__jobMetrics = {}
    self.__stagerThread = None
    self.__stop = False

    if not nonMPIMode:
        # Route these signals to self.stop; registration order is unchanged.
        for signum in (signal.SIGTERM, signal.SIGQUIT, signal.SIGSEGV,
                       signal.SIGXCPU, signal.SIGUSR1, signal.SIGBUS):
            signal.signal(signum, self.stop)
def __init__(self, messageQ, socketname, context, **kwds):
    """Initialise the message thread and try to open the yampl server socket.

    Args:
        messageQ: Queue object stored on the instance.
        socketname: Name passed to ``yampl.ServerSocket``.
        context: Context passed to ``yampl.ServerSocket``.
        **kwds: Forwarded unchanged to ``threading.Thread.__init__``.

    A failure to create the socket is logged at debug level and not
    re-raised; in that case the socket attribute is left unset.
    """
    threading.Thread.__init__(self, **kwds)
    self.__log = Logger.Logger(filename='EventServiceManager.log')
    self.__messageQ = messageQ
    # NOTE(review): on Python 3, threading.Thread uses an internal method
    # named `_stop`; shadowing it with an Event is known to break join().
    # Renaming needs the other methods of this class, which are not
    # visible here - confirm before porting.
    self._stop = threading.Event()
    try:
        self.__messageSrv = yampl.ServerSocket(socketname, context)
    except Exception:
        # Best-effort: record the failure but let KeyboardInterrupt and
        # SystemExit propagate (a bare `except:` swallowed those as well).
        self.__log.debug("Exception: failed to start yampl server socket: %s" % traceback.format_exc())
def __init__(self, globalWorkingDir, localWorkingDir):
    """Initialise the Droid: requester, logger, rank and working directory.

    Args:
        globalWorkingDir: Stored on the instance and reported in the log.
        localWorkingDir: Stored on the instance.

    ``self.stop`` is registered as the SIGTERM handler at the end.
    """
    # Directories first; the current directory is unknown at this point
    # (the log line below reports it after initWorkingDir() has run).
    self.__globalWorkingDir = globalWorkingDir
    self.__localWorkingDir = localWorkingDir
    self.__currentDir = None

    # Collaborators, created in the same order as before.
    self.__comm = Interaction.Requester()
    self.__tmpLog = Logger.Logger()
    self.__esJobManager = None

    self.__rank = self.__comm.getRank()
    global_dir_msg = "Rank %s: Global working dir: %s" % (
        self.__rank, self.__globalWorkingDir)
    self.__tmpLog.info(global_dir_msg)

    self.initWorkingDir()
    current_dir_msg = "Rank %s: Current working dir: %s" % (
        self.__rank, self.__currentDir)
    self.__tmpLog.info(current_dir_msg)

    # Input bookkeeping starts empty.
    self.__poolFileCatalog = None
    self.__inputFiles = None
    self.__copyInputFiles = None

    signal.signal(signal.SIGTERM, self.stop)
] job = {'PandaID':'123', 'jobsetID':'567', } f = open('job_pickle.txt','w') pickle.dump(job,f) f.close() f = open('eventranges_pickle.txt','w') pickle.dump(er,f) f.close() # get logger tmpLog = Logger.Logger() comm = MPI.COMM_WORLD mpirank = comm.Get_rank() if mpirank==0: yoda = Yoda.Yoda() yoda.run() else: snd = Interaction.Requester() tmpLog.debug("rank{0} sending req".format(mpirank)) tmpStat,jobData = snd.sendRequest('getJob',{'siteName':'TEST'}) while True: tmpStat,res = snd.sendRequest('getEventRanges',{'pandaID':jobData['PandaID'], 'jobsetID':jobData['jobsetID']})
def setup(self, job): try: self.__jobId = job.get("JobId", None) self.__startTimeOneJobDroid = time.time() self.__cpuTimeOneJobDroid = os.times() self.__poolFileCatalog = job.get('PoolFileCatalog', None) self.__inputFiles = job.get('InputFiles', None) self.__copyInputFiles = job.get('CopyInputFiles', False) self.__preSetup = job.get('PreSetup', None) self.__postRun = job.get('PostRun', None) self.__yodaToOS = job.get('yodaToOS', False) self.__ATHENA_PROC_NUMBER = int(job.get('ATHENA_PROC_NUMBER', 1)) self.__ATHENA_PROC_NUMBER -= self.reserveCores if self.__ATHENA_PROC_NUMBER < 0: self.__ATHENA_PROC_NUMBER = 1 job["AthenaMPCmd"] = "export ATHENA_PROC_NUMBER=" + str( self.__ATHENA_PROC_NUMBER) + "; " + job["AthenaMPCmd"] self.__jobWorkingDir = job.get('GlobalWorkingDir', None) if self.__jobWorkingDir: self.__jobWorkingDir = os.path.join(self.__jobWorkingDir, 'rank_%s' % self.__rank) if not os.path.exists(self.__jobWorkingDir): os.makedirs(self.__jobWorkingDir) os.chdir(self.__jobWorkingDir) logFile = os.path.join(self.__jobWorkingDir, 'Droid.log') logging.basicConfig(filename=logFile, level=logging.DEBUG) self.__tmpLog = Logger.Logger() if self.__copyInputFiles and self.__inputFiles is not None and self.__poolFileCatalog is not None: for inputFile in self.__inputFiles: shutil.copy(inputFile, './') pfc_name = os.path.basename(self.__poolFileCatalog) pfc_name = os.path.join(os.getcwd(), pfc_name) pfc_name_back = pfc_name + ".back" shutil.copy2(self.__poolFileCatalog, pfc_name_back) with open(pfc_name, 'wt') as pfc_out: with open(pfc_name_back, 'rt') as pfc_in: for line in pfc_in: pfc_out.write( line.replace('HPCWORKINGDIR', os.getcwd())) job["AthenaMPCmd"] = job["AthenaMPCmd"].replace( 'HPCWORKINGDIR', os.getcwd()) self.__esJobManager = EventServerJobManager( self.__rank, self.__ATHENA_PROC_NUMBER, workingDir=self.__jobWorkingDir) status, output = self.__esJobManager.preSetup(self.__preSetup) if status != 0: return False, output status, output = 
self.startStagerThread(job) if status != 0: self.__tmpLog.warning( "Rank %s: failed to start stager thread(status: %s, output: %s)" % (self.__rank, status, output)) return False, output # self.__esJobManager.initMessageThread(socketname='EventService_EventRanges', context='local') # self.__esJobManager.initTokenExtractorProcess(job["TokenExtractCmd"]) # self.__esJobManager.initAthenaMPProcess(job["AthenaMPCmd"]) ret = self.__esJobManager.init( socketname='EventService_EventRanges', context='local', athenaMPCmd=job["AthenaMPCmd"], tokenExtractorCmd=job["TokenExtractCmd"]) return True, None except: errMsg = "Failed to init EventServerJobManager: %s" % str( traceback.format_exc()) self.__esJobManager.terminate() return False, errMsg