def _track_job_log():
    """Poll the HTCondor user log until the job reaches a terminal state.

    Reads events from ``self.jobLog``, maps event types to a status via
    LOGFILE_STATUS_CODES, and logs a human-readable progress message for
    each pass over the log.

    NOTE(review): this takes no ``self`` parameter yet reads ``self.jobLog``
    and ``self._fetch_task_failures()`` — presumably it is a closure nested
    inside a method where ``self`` is captured; confirm against the caller.
    """
    with open(self.jobLog) as f:
        status = None
        while status not in TERMINAL_STATUS_CODES:
            events = htcondor.read_events(f)
            for event in events:
                eventType = event['MyType']
                if eventType in LOGFILE_STATUS_CODES:
                    status = LOGFILE_STATUS_CODES[eventType]
                elif eventType == 6:
                    # Image-size events carry no job-status change; skip.
                    continue
                else:
                    # something went wrong (fixed mojibake comment);
                    # unrecognized event type — leave status unchanged.
                    pass
            running = [SCHEDD_STATUS_CODES[2],
                       SCHEDD_STATUS_CODES[7],
                       SCHEDD_STATUS_CODES[8]]
            if status in running:
                logger.info('Job is running')
            elif status == 'H':
                logger.info('Job is on hold - something went wrong.')
            elif status == 'Idle':
                logger.info('Job is pending')
            elif status == 'Error':
                logger.error(
                    'Job has failed:\n' +
                    '\n'.join(self._fetch_task_failures()))
            # fixed: original referenced undefined name 'job_status'
            elif status in TERMINAL_STATUS_CODES:
                logger.info('Job is done')
def wait_log(self, ulog):
    """ Wait for a job to finish

    Tails the HTCondor user log at *ulog*, feeding each event to
    self.process_event(event, data) and blocking until
    self._is_terminal(data) reports a terminal state.

    :param ulog: path to the HTCondor user log file for the job
    """
    data = {}
    # Bindings older than 8.7.10 only expose htcondor.read_events();
    # newer ones provide the JobEventLog API with a blocking events(None).
    if StrictVersion(VERSION) < StrictVersion('8.7.10'):
        fp = open(ulog)
        events = htcondor.read_events(fp)
        while True:
            try:
                # NOTE(review): .next() is the old reader's / Python-2
                # iterator method — confirm before porting to Python 3.
                r = events.next()
            except StopIteration:
                # No new event yet; back off briefly, then poll again.
                log.debug("No Event but stopiter")
                time.sleep(2.2)
            else:
                self.process_event(r, data)
                log.debug(data)
                if self._is_terminal(data):
                    break
    else:
        # events(None) blocks indefinitely waiting for new events.
        for r in htcondor.JobEventLog(ulog).events(None):
            self.process_event(r, data)
            log.debug(data)
            if self._is_terminal(data):
                break
    log.debug("all jobs terminal")
def readCondorLog(self, job):
    """
    __readCondorLog

    If schedd fails to give information about a job,
    check the condor log file for this job and extract the exit status.

    :param job: dict with a 'cache_dir' key naming the job's cache directory
    :return: dict mapping WMAgent job id -> dict of job status info
    """

    def LogToScheddExitCodeMap(x):
        ### JobStatus shows the last status of the job
        ### Get TriggerEventTypeNumber which is the current status of the job
        ### Map it back to Schedd Status
        ### Mapping done using the exit codes from condor website,
        ### https://htcondor-wiki.cs.wisc.edu/index.cgi/wiki?p=MagicNumbers
        LogExitCode = {0: 1, 1: 1, 2: 0, 3: 2, 4: 3, 5: 4, 6: 2,
                       7: 0, 8: 0, 9: 0, 10: 0, 11: 1, 12: 5, 13: 2}
        # Single dict lookup with a default (original called .get() twice).
        return LogExitCode.get(x, 100)

    ### This should select the latest log file in the cache_dir
    fmtime = 0
    logFile = None
    for joblog in os.listdir(job['cache_dir']):
        if fnmatch.fnmatch(joblog, 'condor.*.*.log'):
            _tmplogFile = os.path.join(job['cache_dir'], joblog)
            _tmpfmtime = int(os.path.getmtime(_tmplogFile))
            if _tmpfmtime > fmtime:
                fmtime = _tmpfmtime
                logFile = _tmplogFile

    jobLogInfo = {}
    try:
        logging.debug("Opening condor job log file: %s", logFile)
        # TypeError covers logFile is None (no matching log found);
        # IOError/OSError cover an unreadable file. The original bare
        # except: hid every other error as well.
        logfileobj = open(logFile, "r")
    except (IOError, OSError, TypeError):
        logging.debug('Cannot open condor job log file %s', logFile)
    else:
        try:
            cres = condor.read_events(logfileobj, 1)
            ulog = list(cres)
        finally:
            # Close the handle (leaked in the original).
            logfileobj.close()
        if len(ulog) > 0:
            tmpDict = {}
            _tmpStat = int(ulog[-1]["TriggerEventTypeNumber"])
            tmpDict["JobStatus"] = LogToScheddExitCodeMap(_tmpStat)
            tmpDict["submitTime"] = int(ulog[-1]["QDate"])
            tmpDict["runningTime"] = int(ulog[-1]["JobStartDate"]) if ulog[-1]["JobStartDate"] is not None else 0
            tmpDict["stateTime"] = int(ulog[-1]["EnteredCurrentStatus"])
            tmpDict["runningCMSSite"] = ulog[-1]["MachineAttrGLIDEIN_CMSSite0"]
            tmpDict["WMAgentID"] = int(ulog[-1]["WMAgent_JobID"])
            jobLogInfo[tmpDict["WMAgentID"]] = tmpDict
        else:
            logging.debug('%s is EMPTY', str(logFile))

    logging.info("Retrieved %i Info from Condor Job Log file %s",
                 len(jobLogInfo), logFile)
    return jobLogInfo
def readCondorLog(self, job):
    """
    __readCondorLog

    If schedd fails to give information about a job,
    check the condor log file for this job and extract the exit status.

    :param job: dict with a 'cache_dir' key naming the job's cache directory
    :return: dict mapping WMAgent job id -> dict of job status info
    """
    ### This should select the latest log file in the cache_dir
    fmtime = 0
    logFile = None
    jobLogInfo = {}
    if not os.path.exists(job['cache_dir']):
        logging.info('%s does not exist.', job['cache_dir'])
        return jobLogInfo

    for joblog in os.listdir(job['cache_dir']):
        if fnmatch.fnmatch(joblog, 'condor.*.*.log'):
            _tmplogFile = os.path.join(job['cache_dir'], joblog)
            _tmpfmtime = int(os.path.getmtime(_tmplogFile))
            if _tmpfmtime > fmtime:
                fmtime = _tmpfmtime
                logFile = _tmplogFile

    if logFile is None:
        # No condor.*.*.log file found; the original reached this state by
        # letting open(None) raise and swallowing it with a bare except.
        logging.debug('Cannot open condor job log file %s', logFile)
        logging.info("Retrieved %i Info from Condor Job Log file %s",
                     len(jobLogInfo), logFile)
        return jobLogInfo

    try:
        logging.debug("Opening condor job log file: %s", logFile)
        logfileobj = open(logFile, "r")
    except (IOError, OSError):
        # Narrowed from a bare except: only file-access failures are expected.
        logging.debug('Cannot open condor job log file %s', logFile)
    else:
        try:
            cres = condor.read_events(logfileobj, 1)
            ulog = list(cres)
        finally:
            # Close the handle (leaked in the original).
            logfileobj.close()
        if len(ulog) > 0:
            lastEvent = ulog[-1]
            # Guard against truncated/corrupt events missing expected keys.
            requiredKeys = ("TriggerEventTypeNumber", "QDate", "JobStartDate",
                            "EnteredCurrentStatus", "MATCH_EXP_JOBGLIDEIN_CMSSite",
                            "WMAgent_JobID")
            if all(key in lastEvent for key in requiredKeys):
                tmpDict = {}
                _tmpStat = int(lastEvent["TriggerEventTypeNumber"])
                tmpDict["JobStatus"] = PyCondorPlugin.logToScheddExitCodeMap(_tmpStat)
                tmpDict["submitTime"] = int(lastEvent["QDate"])
                tmpDict["runningTime"] = int(lastEvent["JobStartDate"])
                tmpDict["stateTime"] = int(lastEvent["EnteredCurrentStatus"])
                tmpDict["runningCMSSite"] = lastEvent["MATCH_EXP_JOBGLIDEIN_CMSSite"]
                tmpDict["WMAgentID"] = int(lastEvent["WMAgent_JobID"])
                jobLogInfo[tmpDict["WMAgentID"]] = tmpDict
            else:
                logging.debug('%s is CORRUPT', str(logFile))
        else:
            logging.debug('%s is EMPTY', str(logFile))

    logging.info("Retrieved %i Info from Condor Job Log file %s",
                 len(jobLogInfo), logFile)
    return jobLogInfo
def main(): #Get the path of submitfile path = Flags.submit #Get the name of output file,logfile em = ExtractMacros(path) #Get the macro value output = em.extract('transfer_output_files') if output.has_key('transfer_output_files'): if len(output['transfer_output_files']) != 0: output = em.extract( 'transfer_output_files')['transfer_output_files'][0] else: output = None log = em.extract('Log') if log.has_key('Log'): if len(log['Log']) != 0: log = em.extract('Log')['Log'][0] else: log = None if (output != None) and os.path.exists(output): #读取当前的y值 with open(output, "r") as f: y = int(f.readline()) #analyse log file m = 0 if log != None: iter = htcondor.read_events(open(log)) #read the memory usage for i in iter: if i['MyType'] == 'JobImageSizeEvent': m = i['MemoryUsage'] #更新loopfile with open(r'loopfile', 'r') as f: temp = cPickle.load(f) #Update the current y temp['currentY'] = y #Update the list of histrical y temp['yhistory'].append(y) #Update the number of loops: temp['nloops'] += 1 #update the memory usage temp['mems'] += m with open(r'loopfile', 'w') as f: print 'currentY: ', y cPickle.dump(temp, f) else: print 'before is a noop job' #for a but time.sleep(21)
def status_from_logfile():
    """Poll LOGFILE until the job status becomes terminal, printing progress."""
    status = None
    with open(LOGFILE) as log_fp:
        while status not in TERMINAL_STATUS_CODES:
            for event in htcondor.read_events(log_fp):
                # Keep the MyType lookup so a malformed event fails loudly,
                # exactly as before.
                event_kind = event['MyType']
                type_id = event['EventTypeNumber']
                if type_id in LOGFILE_STATUS_CODES:
                    status = LOGFILE_STATUS_CODES[type_id]
            print("From JOBFILE:", status)
def monitor_job(self): ''' monitor job finishing time :return: ''' logfile = self.find_log() #monitoring when job finish file = open("./"+logfile) cnt = 0 while 1: #打印时间 sys.stdout.write('Used time: '+str(cnt) + "\r") #monitoring tail of log where = file.tell() line = file.readline() if not line: time.sleep(1) file.seek(where) sys.stdout.flush() cnt += 1 else: if "Job terminated" in line: print "Job finished!" break #analyze the log iterator = htcondor.read_events(open("./"+logfile)) loglist = [] while True: a = dict(iterator.next()) # parse the datetime if a['MyType'] == 'SubmitEvent': stime = a['EventTime'].replace('T',' ') st = datetime.strptime(stime, '%Y-%m-%d %H:%M:%S') cluster = a['Cluster'] if a['MyType'] == 'JobTerminatedEvent': etime = a['EventTime'].replace('T',' ') et = datetime.strptime(etime, '%Y-%m-%d %H:%M:%S') cluster = a['Cluster'] break print '-----------Analysis Result-------------:' print 'Job use %0.2f s in Cluster %s' % ((et-st).seconds, cluster)
def testEventLog(self):
    """The first event parsed from the sample log matches the expected dict."""
    parsed = list(htcondor.read_events(open("tests/test_log.txt")))
    self.assertEquals(len(parsed), 4)

    first = dict(parsed[0])
    # CurrentTime is nondeterministic, so drop it before comparing.
    first.pop('CurrentTime', None)

    expected = {"LogNotes": "DAG Node: Job1",
                "MyType": "SubmitEvent",
                "EventTypeNumber": 0,
                "Subproc": 0,
                "Cluster": 236467,
                "Proc": 0,
                "EventTime": "%d-11-15T17:05:55" % datetime.datetime.now().year,
                "SubmitHost": "<169.228.38.38:9615?sock=18627_6227_3>",
                }
    # Same checks as before: identical key sets, then identical values.
    self.assertEquals(set(first.keys()), set(expected.keys()))
    for key, value in first.items():
        self.assertEquals(value, expected[key])
def setUp(self):
    """Copy the sample job log to a named file and open an event reader on it."""
    # Note we cannot use a temporary file here; the event reader
    # is based on *filenames* (which are not visible for TemporaryFile),
    # not file descriptors.
    self.testname = "tests/test_event_reader.log"
    # Copy the canned log via 'with' blocks: the original leaked both the
    # read handle from open(...).read() and the write handle (only flushed).
    with open("tests/job.log", "r") as source:
        contents = source.read()
    with open(self.testname, "w") as dest:
        dest.write(contents)
    self.testfile = open(self.testname, "r")
    self.reader = htcondor.read_events(self.testfile)
    # Expected attributes of the sample JobImageSizeEvent.
    self.sampleEvent = {
        'MyType': "JobImageSizeEvent",
        'EventTypeNumber': 6,
        'Subproc': 0,
        'Cluster': 23515,
        'Proc': 0,
        'MemoryUsage': 1,
        'Size': 260,
        'ResidentSetSize': 252,
        'EventTime': 0,
    }
    self.sampleEventText = SAMPLE_EVENT_TEXT
def main():
    """ Process all of the output from the sites

    Reads <site>/<site>.log (site taken from argv[1]), summarizes each
    job's transfer results, and writes postprocess.<site>.json.
    :return: 0 on success
    """
    site = sys.argv[1]
    # Tests is a ClusterId.ProcId indexed dictionary, so we overwrite
    # subsequent events.
    tests = {}
    # First open the logfile; 'with' closes it (it was leaked before).
    with open(os.path.join(site, "%s.log" % site)) as logfile:
        # Read in the events
        for event in htcondor.read_events(logfile):
            tmpTest = Test()
            if 'TriggerEventTypeName' in event and event['TriggerEventTypeName'] == "ULOG_JOB_TERMINATED":
                # A finished event
                if 'Chirp_StashCp_DlTimeMs' in event and event['Chirp_StashCp_DlTimeMs'] != "":
                    tmpTest.duration = float(event['Chirp_StashCp_DlTimeMs']) / 1000
                if 'Chirp_TransferSuccess' in event and event['Chirp_TransferSuccess'] == True:
                    tmpTest.success = True
                if "Chirp_StashCp_Prefix" in event and event["Chirp_StashCp_Prefix"] != "":
                    tmpTest.cache = event["Chirp_StashCp_Prefix"]
            tests["%i.%i" % (event['Cluster'], event['Proc'])] = tmpTest.__dict__

    outputfile = "postprocess.%s.json" % site
    with open(outputfile, 'w') as f:
        # list(...) so this also works on Python 3, where dict.values()
        # returns a view that json cannot serialize.
        f.write(json.dumps(list(tests.values())))
    return 0
def setUp(self):
    """Copy the sample job log to a named file and open an event reader on it."""
    # Note we cannot use a temporary file here; the event reader
    # is based on *filenames* (which are not visible for TemporaryFile),
    # not file descriptors.
    self.testname = "tests/test_event_reader.log"
    # Copy the canned log via 'with' blocks: the original leaked both the
    # read handle from open(...).read() and the write handle (only flushed).
    with open("tests/job.log", "r") as source:
        contents = source.read()
    with open(self.testname, "w") as dest:
        dest.write(contents)
    self.testfile = open(self.testname, "r")
    self.reader = htcondor.read_events(self.testfile)
    # Expected attributes of the sample JobImageSizeEvent.
    self.sampleEvent = {
        'MyType': "JobImageSizeEvent",
        'EventTypeNumber': 6,
        'Subproc': 0,
        'Cluster': 23515,
        'Proc': 0,
        'MemoryUsage': 1,
        'Size': 260,
        'ResidentSetSize': 252,
        'CurrentTime': 0,
        'EventTime': 0,
    }
    self.sampleEventText = SAMPLE_EVENT_TEXT