Exemplo n.º 1
0
 def _track_job_log():
     """Poll the HTCondor user log until the job reaches a terminal state,
     then log a human-readable summary of its final status.

     Events are read from ``self.jobLog`` with ``htcondor.read_events`` and
     mapped to status codes through ``LOGFILE_STATUS_CODES``; image-size
     events (event type 6) carry no status change and are skipped.

     NOTE(review): this function references ``self`` but takes no ``self``
     parameter -- it was likely meant to be a method; confirm against the
     enclosing class.
     """
     with open(self.jobLog) as f:
         status = None
         while status not in TERMINAL_STATUS_CODES:
             events = htcondor.read_events(f)
             for event in events:
                 eventType = event['MyType']
                 if eventType in LOGFILE_STATUS_CODES:
                     status = LOGFILE_STATUS_CODES[eventType]
                 elif eventType == 6:
                     # Image-size update: no status change.
                     continue
                 else:
                     # Unrecognized event type -- something went wrong;
                     # ignore it and keep polling.  (Original had an empty
                     # else branch, which was a syntax error.)
                     pass
     running = [SCHEDD_STATUS_CODES[2],
                SCHEDD_STATUS_CODES[7], SCHEDD_STATUS_CODES[8]]
     if status in running:
         logger.info('Job is running')
     elif status == 'H':
         logger.info('Job is on hold - something went wrong.')
     elif status == 'Idle':
         logger.info('Job is pending')
     elif status == 'Error':
         logger.error(
             'Job has failed:\n' + '\n'.join(self._fetch_task_failures()))
     elif status in TERMINAL_STATUS_CODES:
         # Original referenced an undefined name 'job_status' here.
         logger.info('Job is done')
Exemplo n.º 2
0
    def wait_log(self, ulog):
        """Block until every job recorded in the user log *ulog* is terminal.

        For HTCondor bindings older than 8.7.10 the legacy ``read_events``
        iterator is polled manually (sleeping between empty reads); newer
        bindings use the blocking ``JobEventLog.events(None)`` iterator.
        Each event is folded into *data* via ``self.process_event`` and the
        loop exits once ``self._is_terminal(data)`` is true.
        """
        data = {}
        if StrictVersion(VERSION) < StrictVersion('8.7.10'):
            fp = open(ulog)
            try:
                events = htcondor.read_events(fp)
                while True:
                    try:
                        # next() works on both Python 2 and 3 iterators,
                        # unlike the original's events.next().
                        r = next(events)
                    except StopIteration:
                        log.debug("No Event but stopiter")
                        time.sleep(2.2)
                    else:
                        self.process_event(r, data)
                        log.debug(data)
                        if self._is_terminal(data):
                            break
            finally:
                # The original leaked this file handle.
                fp.close()
        else:
            for r in htcondor.JobEventLog(ulog).events(None):
                self.process_event(r, data)
                log.debug(data)
                if self._is_terminal(data):
                    break
        log.debug("all jobs terminal")
Exemplo n.º 3
0
    def readCondorLog(self, job):
        """
        __readCondorLog

        If schedd fails to give information about a job
        Check the condor log file for this job
        Extract Exit status

        Returns a dict keyed by WMAgent job id with the job's status and
        timing information, or an empty dict when no usable log is found.
        """
        def LogToScheddExitCodeMap(x):
            ### JobStatus shows the last status of the job
            ### Get TriggerEventTypeNumber which is the current status of the job
            ### Map it back to Schedd Status
            ### Mapping done using the exit codes from condor website,
            ### https://htcondor-wiki.cs.wisc.edu/index.cgi/wiki?p=MagicNumbers
            LogExitCode = {0: 1, 1: 1, 2: 0, 3: 2, 4: 3, 5: 4, 6: 2,
                           7: 0, 8: 0, 9: 0, 10: 0, 11: 1, 12: 5, 13: 2}
            # Unknown event types map to 100 (unrecognised status).
            return LogExitCode.get(x, 100)

        ### This should select the latest log file in the cache_dir
        fmtime = 0
        logFile = None
        for joblog in os.listdir(job['cache_dir']):
            if fnmatch.fnmatch(joblog, 'condor.*.*.log'):
                _tmplogFile = os.path.join(job['cache_dir'], joblog)
                _tmpfmtime = int(os.path.getmtime(_tmplogFile))
                if _tmpfmtime > fmtime:
                    fmtime = _tmpfmtime
                    logFile = _tmplogFile

        jobLogInfo = {}
        if logFile is None:
            # Original would call open(None) here and mask the TypeError
            # with a bare except; report and bail out instead.
            logging.debug('No condor job log file found in %s', job['cache_dir'])
            logging.info("Retrieved %i Info from Condor Job Log file %s",
                         len(jobLogInfo), logFile)
            return jobLogInfo

        try:
            logging.debug("Opening condor job log file: %s", logFile)
            logfileobj = open(logFile, "r")
        except IOError:
            # Narrowed from a bare except: only I/O failures are expected.
            logging.debug('Cannot open condor job log file %s', logFile)
        else:
            try:
                tmpDict = {}
                cres = condor.read_events(logfileobj, 1)
                ulog = list(cres)
                if len(ulog) > 0:
                    _tmpStat = int(ulog[-1]["TriggerEventTypeNumber"])
                    tmpDict["JobStatus"] = LogToScheddExitCodeMap(_tmpStat)
                    tmpDict["submitTime"] = int(ulog[-1]["QDate"])
                    tmpDict["runningTime"] = int(ulog[-1]["JobStartDate"]) if ulog[-1]["JobStartDate"] is not None else 0
                    tmpDict["stateTime"] = int(ulog[-1]["EnteredCurrentStatus"])
                    tmpDict["runningCMSSite"] = ulog[-1]["MachineAttrGLIDEIN_CMSSite0"]
                    tmpDict["WMAgentID"] = int(ulog[-1]["WMAgent_JobID"])
                    _tmpID = tmpDict["WMAgentID"]
                    jobLogInfo[_tmpID] = tmpDict
                else:
                    logging.debug('%s is EMPTY', str(logFile))
            finally:
                # The original leaked this file handle.
                logfileobj.close()

        logging.info("Retrieved %i Info from Condor Job Log file %s",
                     len(jobLogInfo), logFile)

        return jobLogInfo
Exemplo n.º 4
0
    def readCondorLog(self, job):
        """
        __readCondorLog

        If schedd fails to give information about a job
        Check the condor log file for this job
        Extract Exit status

        Returns a dict keyed by WMAgent job id with the job's status and
        timing information, or an empty dict when no usable log is found.
        """
        ### This should select the latest log file in the cache_dir
        fmtime = 0
        logFile = None
        jobLogInfo = {}
        if not os.path.exists(job['cache_dir']):
            logging.info('%s does not exist.', job['cache_dir'])
            return jobLogInfo

        for joblog in os.listdir(job['cache_dir']):
            if fnmatch.fnmatch(joblog, 'condor.*.*.log'):
                _tmplogFile = os.path.join(job['cache_dir'], joblog)
                _tmpfmtime = int(os.path.getmtime(_tmplogFile))
                if _tmpfmtime > fmtime:
                    fmtime = _tmpfmtime
                    logFile = _tmplogFile

        if logFile is None:
            # Original would call open(None) here and mask the TypeError
            # with a bare except; report and bail out instead.
            logging.debug('No condor job log file found in %s', job['cache_dir'])
            logging.info("Retrieved %i Info from Condor Job Log file %s",
                         len(jobLogInfo), logFile)
            return jobLogInfo

        try:
            logging.debug("Opening condor job log file: %s", logFile)
            logfileobj = open(logFile, "r")
        except IOError:
            # Narrowed from a bare except: only I/O failures are expected.
            logging.debug('Cannot open condor job log file %s', logFile)
        else:
            try:
                tmpDict = {}
                cres = condor.read_events(logfileobj, 1)
                ulog = list(cres)
                if len(ulog) > 0:
                    # Only trust the record if every field we need is present.
                    if all(key in ulog[-1] for key in ("TriggerEventTypeNumber", "QDate", "JobStartDate",
                                                       "EnteredCurrentStatus", "MATCH_EXP_JOBGLIDEIN_CMSSite",
                                                       "WMAgent_JobID")):

                        _tmpStat = int(ulog[-1]["TriggerEventTypeNumber"])
                        tmpDict["JobStatus"] = PyCondorPlugin.logToScheddExitCodeMap(_tmpStat)
                        tmpDict["submitTime"] = int(ulog[-1]["QDate"])
                        tmpDict["runningTime"] = int(ulog[-1]["JobStartDate"])
                        tmpDict["stateTime"] = int(ulog[-1]["EnteredCurrentStatus"])
                        tmpDict["runningCMSSite"] = ulog[-1]["MATCH_EXP_JOBGLIDEIN_CMSSite"]
                        tmpDict["WMAgentID"] = int(ulog[-1]["WMAgent_JobID"])
                        jobLogInfo[tmpDict["WMAgentID"]] = tmpDict
                    else:
                        logging.debug('%s is CORRUPT', str(logFile))
                else:
                    logging.debug('%s is EMPTY', str(logFile))
            finally:
                # The original leaked this file handle.
                logfileobj.close()

        logging.info("Retrieved %i Info from Condor Job Log file %s", len(jobLogInfo), logFile)

        return jobLogInfo
Exemplo n.º 5
0
def main():
    #Get the path of submitfile
    path = Flags.submit
    #Get the name of output file,logfile
    em = ExtractMacros(path)

    #Get the macro value
    output = em.extract('transfer_output_files')
    if output.has_key('transfer_output_files'):
        if len(output['transfer_output_files']) != 0:
            output = em.extract(
                'transfer_output_files')['transfer_output_files'][0]
    else:
        output = None

    log = em.extract('Log')
    if log.has_key('Log'):
        if len(log['Log']) != 0:
            log = em.extract('Log')['Log'][0]
    else:
        log = None

    if (output != None) and os.path.exists(output):
        #读取当前的y值
        with open(output, "r") as f:
            y = int(f.readline())
        #analyse log file
        m = 0
        if log != None:
            iter = htcondor.read_events(open(log))
            #read the memory usage
            for i in iter:
                if i['MyType'] == 'JobImageSizeEvent':
                    m = i['MemoryUsage']

        #更新loopfile
        with open(r'loopfile', 'r') as f:
            temp = cPickle.load(f)
        #Update the current y
        temp['currentY'] = y
        #Update the list of histrical y
        temp['yhistory'].append(y)
        #Update the number of loops:
        temp['nloops'] += 1
        #update the memory usage
        temp['mems'] += m

        with open(r'loopfile', 'w') as f:
            print 'currentY:  ', y
            cPickle.dump(temp, f)
    else:
        print 'before is a noop job'

    #for a but
    time.sleep(21)
Exemplo n.º 6
0
def status_from_logfile():
    """Poll LOGFILE, printing the status derived from each event, until a
    terminal status code is observed."""
    with open(LOGFILE) as log_fp:
        status = None
        while status not in TERMINAL_STATUS_CODES:
            for event in htcondor.read_events(log_fp):
                # MyType is read for parity with the event schema even
                # though only the numeric type id drives the mapping.
                event_name = event['MyType']
                type_id = event['EventTypeNumber']
                if type_id in LOGFILE_STATUS_CODES:
                    status = LOGFILE_STATUS_CODES[type_id]
                print("From JOBFILE:", status)
Exemplo n.º 7
0
    def monitor_job(self):
        '''
        monitor job finishing time
       :return:
        '''
        logfile = self.find_log()
        #monitoring when job finish
        file = open("./"+logfile)
        cnt = 0
        while 1:
            #打印时间
            sys.stdout.write('Used time: '+str(cnt) + "\r")
            #monitoring tail of log
            where = file.tell()
            line = file.readline()
            if not line:
                time.sleep(1)
                file.seek(where)
                sys.stdout.flush()
                cnt += 1
            else:
                if "Job terminated" in line:
                    print "Job finished!"
                    break



        #analyze the log
        iterator = htcondor.read_events(open("./"+logfile))
        loglist = []
        while True:
            a = dict(iterator.next())
            # parse the datetime
            if a['MyType'] == 'SubmitEvent':
                stime = a['EventTime'].replace('T',' ')
                st = datetime.strptime(stime, '%Y-%m-%d %H:%M:%S')
                cluster = a['Cluster']

            if a['MyType'] == 'JobTerminatedEvent':
                etime = a['EventTime'].replace('T',' ')

                et = datetime.strptime(etime, '%Y-%m-%d %H:%M:%S')
                cluster = a['Cluster']
                break
        print '-----------Analysis Result-------------:'
        print 'Job use %0.2f s in Cluster %s' % ((et-st).seconds, cluster)
Exemplo n.º 8
0
 def testEventLog(self):
     """Read four events from the sample log and verify the first
     SubmitEvent carries exactly the expected classad fields."""
     # 'with' closes the handle the original leaked via a bare open().
     with open("tests/test_log.txt") as fp:
         events = list(htcondor.read_events(fp))
     # assertEqual: assertEquals is a deprecated alias.
     self.assertEqual(len(events), 4)
     a = dict(events[0])
     # CurrentTime is binding-version dependent; ignore it if present.
     if 'CurrentTime' in a:
         del a['CurrentTime']
     b = {"LogNotes": "DAG Node: Job1",
          "MyType": "SubmitEvent",
          "EventTypeNumber": 0,
          "Subproc": 0,
          "Cluster": 236467,
          "Proc": 0,
          "EventTime": "%d-11-15T17:05:55" % datetime.datetime.now().year,
          "SubmitHost": "<169.228.38.38:9615?sock=18627_6227_3>",
         }
     self.assertEqual(set(a.keys()), set(b.keys()))
     for key, val in a.items():
         self.assertEqual(val, b[key])
Exemplo n.º 9
0
 def testEventLog(self):
     """Read four events from the sample log and verify the first
     SubmitEvent carries exactly the expected classad fields."""
     # 'with' closes the handle the original leaked via a bare open().
     with open("tests/test_log.txt") as fp:
         events = list(htcondor.read_events(fp))
     # assertEqual: assertEquals is a deprecated alias.
     self.assertEqual(len(events), 4)
     a = dict(events[0])
     # CurrentTime is binding-version dependent; ignore it if present.
     if 'CurrentTime' in a:
         del a['CurrentTime']
     b = {"LogNotes": "DAG Node: Job1",
          "MyType": "SubmitEvent",
          "EventTypeNumber": 0,
          "Subproc": 0,
          "Cluster": 236467,
          "Proc": 0,
          "EventTime": "%d-11-15T17:05:55" % datetime.datetime.now().year,
          "SubmitHost": "<169.228.38.38:9615?sock=18627_6227_3>",
         }
     self.assertEqual(set(a.keys()), set(b.keys()))
     for key, val in a.items():
         self.assertEqual(val, b[key])
Exemplo n.º 10
0
 def setUp(self):
     # Note we cannot use a temporary file here; the event reader
     # is based on *filenames* (which are not visible for TemporaryFile),
     # not file descriptors.
     self.testname = "tests/test_event_reader.log"
     # Copy the fixture, closing both handles (the original leaked the
     # inner read handle and never closed the write handle).
     with open("tests/job.log", "r") as src, open(self.testname, "w") as dst:
         dst.write(src.read())
     self.testfile = open(self.testname, "r")
     self.reader = htcondor.read_events(self.testfile)
     # Expected fields of a JobImageSizeEvent from the fixture log.
     self.sampleEvent = { \
         'MyType': "JobImageSizeEvent",
         'EventTypeNumber': 6,
         'Subproc': 0,
         'Cluster': 23515,
         'Proc': 0,
         'MemoryUsage': 1,
         'Size': 260,
         'ResidentSetSize': 252,
         'EventTime': 0,
     }
     self.sampleEventText = SAMPLE_EVENT_TEXT
Exemplo n.º 11
0
def main():
    """
    Process all of the output from the sites

    Reads the site's HTCondor user log, extracts one Test record per
    terminated job, and writes them as JSON to postprocess.<site>.json.
    """

    site = sys.argv[1]

    # Tests is a ClusterId.ProcId indexed dictionary, so we overwrite
    # subsequent events.
    tests = {}

    # 'with' closes the log handle the original leaked.
    with open(os.path.join(site, "%s.log" % site)) as logfile:
        for event in htcondor.read_events(logfile):
            if 'TriggerEventTypeName' in event and event['TriggerEventTypeName'] == "ULOG_JOB_TERMINATED":
                # A finished event; only allocate a Test for these.
                tmpTest = Test()

                if 'Chirp_StashCp_DlTimeMs' in event and event['Chirp_StashCp_DlTimeMs'] != "":
                    tmpTest.duration = float(event['Chirp_StashCp_DlTimeMs']) / 1000

                if 'Chirp_TransferSuccess' in event and event['Chirp_TransferSuccess'] == True:
                    tmpTest.success = True

                if "Chirp_StashCp_Prefix" in event and event["Chirp_StashCp_Prefix"] != "":
                    tmpTest.cache = event["Chirp_StashCp_Prefix"]

                tests["%i.%i" % (event['Cluster'], event['Proc'])] = tmpTest.__dict__

    outputfile = "postprocess.%s.json" % site
    with open(outputfile, 'w') as f:
        # dict.values() is not JSON-serialisable on Python 3; wrap in list().
        f.write(json.dumps(list(tests.values())))

    return 0
Exemplo n.º 12
0
 def setUp(self):
     # Note we cannot use a temporary file here; the event reader
     # is based on *filenames* (which are not visible for TemporaryFile),
     # not file descriptors.
     self.testname = "tests/test_event_reader.log"
     # Copy the fixture, closing both handles (the original leaked the
     # inner read handle and never closed the write handle).
     with open("tests/job.log", "r") as src, open(self.testname, "w") as dst:
         dst.write(src.read())
     self.testfile = open(self.testname, "r")
     self.reader = htcondor.read_events(self.testfile)
     # Expected fields of a JobImageSizeEvent from the fixture log.
     self.sampleEvent = { \
         'MyType': "JobImageSizeEvent",
         'EventTypeNumber': 6,
         'Subproc': 0,
         'Cluster': 23515,
         'Proc': 0,
         'MemoryUsage': 1, 
         'Size': 260,
         'ResidentSetSize': 252,
         'CurrentTime': 0,
         'EventTime': 0,
     }
     self.sampleEventText = SAMPLE_EVENT_TEXT